Kernels

danieldk committed
Commit ba38f49 · 1 Parent(s): 2c8e21f

Remove source
README.md CHANGED
@@ -7,3 +7,5 @@ tags:
  ![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/deformable-detr)

  ## deformable-detr
+
+ Kernel source: https://github.com/huggingface/kernels-community/tree/main/deformable-detr
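With the source removed from the repository, the compiled kernel is fetched from the Hub at load time. Below is a minimal usage sketch with the `kernels` library; the exported op name `ms_deform_attn_forward` and its argument order are assumptions inferred from the CUDA entry points deleted below, since `torch-ext/torch_binding.cpp` is not shown in this diff.

```python
# Minimal sketch: loading this kernel from the Hub with the `kernels` library.
# ASSUMPTION: the binding exports `ms_deform_attn_forward` with the same
# argument order as ms_deform_attn_cuda_forward below; check the kernel
# source's torch_binding.cpp for the actual interface.
import torch
from kernels import get_kernel

ops = get_kernel("kernels-community/deformable-detr")

batch, heads, channels = 2, 8, 32
queries, points = 16, 4
spatial_shapes = torch.tensor([[8, 8]], dtype=torch.int64, device="cuda")  # one 8x8 level
level_start_index = torch.tensor([0], dtype=torch.int64, device="cuda")
spatial = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())

value = torch.rand(batch, spatial, heads, channels, device="cuda")
sampling_loc = torch.rand(batch, queries, heads, 1, points, 2, device="cuda")
attn_weight = torch.rand(batch, queries, heads, 1, points, device="cuda")

out = ops.ms_deform_attn_forward(
    value, spatial_shapes, level_start_index, sampling_loc, attn_weight, 64
)
print(out.shape)  # expected: (batch, queries, heads * channels)
```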
build.toml DELETED
@@ -1,20 +0,0 @@
- [general]
- name = "deformable_detr"
- universal = false
-
- [torch]
- src = [
-   "torch-ext/torch_binding.cpp",
-   "torch-ext/torch_binding.h",
- ]
-
- [kernel.activation]
- backend = "cuda"
- depends = ["torch"]
- include = ["."]
- src = [
-   "deformable_detr/ms_deform_attn_cuda.cu",
-   "deformable_detr/ms_deform_im2col_cuda.cuh",
-   "deformable_detr/ms_deform_attn_cuda.cuh",
-   "deformable_detr/ms_deform_attn_cuda.h",
- ]
deformable_detr/ms_deform_attn_cuda.cu DELETED
@@ -1,158 +0,0 @@
- /*!
- **************************************************************************************************
- * Deformable DETR
- * Copyright (c) 2020 SenseTime. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
- **************************************************************************************************
- * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
- **************************************************************************************************
- */
-
- #include <vector>
- #include "deformable_detr/ms_deform_im2col_cuda.cuh"
-
- #include <ATen/ATen.h>
- #include <ATen/cuda/CUDAContext.h>
- #include <cuda.h>
- #include <cuda_runtime.h>
-
- #include <torch/all.h>
-
-
- at::Tensor ms_deform_attn_cuda_forward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const int64_t im2col_step)
- {
-     at::DeviceGuard guard(value.device());
-
-     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
-     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
-     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
-     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
-     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
-
-     AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
-     AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
-     AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
-     AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
-     AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
-
-     const int batch = value.size(0);
-     const int spatial_size = value.size(1);
-     const int num_heads = value.size(2);
-     const int channels = value.size(3);
-
-     const int num_levels = spatial_shapes.size(0);
-
-     const int num_query = sampling_loc.size(1);
-     const int num_point = sampling_loc.size(4);
-
-     const int im2col_step_ = std::min(batch, static_cast<int>(im2col_step));
-
-     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
-
-     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
-
-     const int batch_n = im2col_step_;
-     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
-     auto per_value_size = spatial_size * num_heads * channels;
-     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
-     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
-     for (int n = 0; n < batch/im2col_step_; ++n)
-     {
-         auto columns = output_n.select(0, n);
-         AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
-             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
-                 value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
-                 spatial_shapes.data_ptr<int64_t>(),
-                 level_start_index.data_ptr<int64_t>(),
-                 sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
-                 attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
-                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
-                 columns.data_ptr<scalar_t>());
-
-         }));
-     }
-
-     output = output.view({batch, num_query, num_heads*channels});
-
-     return output;
- }
-
-
- std::vector<at::Tensor> ms_deform_attn_cuda_backward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const at::Tensor &grad_output,
-     const int64_t im2col_step)
- {
-     at::DeviceGuard guard(value.device());
-
-     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
-     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
-     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
-     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
-     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
-     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
-
-     AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
-     AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
-     AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
-     AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
-     AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
-     AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
-
-     const int batch = value.size(0);
-     const int spatial_size = value.size(1);
-     const int num_heads = value.size(2);
-     const int channels = value.size(3);
-
-     const int num_levels = spatial_shapes.size(0);
-
-     const int num_query = sampling_loc.size(1);
-     const int num_point = sampling_loc.size(4);
-
-     const int im2col_step_ = std::min(batch, static_cast<int>(im2col_step));
-
-     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
-
-     auto grad_value = at::zeros_like(value);
-     auto grad_sampling_loc = at::zeros_like(sampling_loc);
-     auto grad_attn_weight = at::zeros_like(attn_weight);
-
-     const int batch_n = im2col_step_;
-     auto per_value_size = spatial_size * num_heads * channels;
-     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
-     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
-     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
-
-     for (int n = 0; n < batch/im2col_step_; ++n)
-     {
-         auto grad_output_g = grad_output_n.select(0, n);
-         AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
-             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
-                 grad_output_g.data_ptr<scalar_t>(),
-                 value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
-                 spatial_shapes.data_ptr<int64_t>(),
-                 level_start_index.data_ptr<int64_t>(),
-                 sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
-                 attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
-                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
-                 grad_value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
-                 grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
-                 grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
-
-         }));
-     }
-
-     return {
-         grad_value, grad_sampling_loc, grad_attn_weight
-     };
- }
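For orientation, here is what the deleted forward kernel computes, as a slow pure-PyTorch sketch adapted from the reference implementation in the original Deformable DETR codebase. Shapes follow `ms_deform_attn_cuda_forward` above; this sketch is illustrative and not part of the repository. The `-0.5` pixel offset in the CUDA kernel corresponds to `align_corners=False` in `grid_sample` here.

```python
# Hedged sketch: pure-PyTorch reference of multi-scale deformable attention,
# matching the tensor shapes used by ms_deform_attn_cuda_forward above.
# Slow, but handy for checking the CUDA kernel's output numerically.
import torch
import torch.nn.functional as F

def ms_deform_attn_pytorch(value, spatial_shapes, sampling_loc, attn_weight):
    # value:         (bs, spatial_size, num_heads, channels)
    # spatial_shapes:(num_levels, 2) as (H, W) per feature level
    # sampling_loc:  (bs, num_query, num_heads, num_levels, num_point, 2) in [0, 1]
    # attn_weight:   (bs, num_query, num_heads, num_levels, num_point)
    bs, _, num_heads, channels = value.shape
    _, num_query, _, num_levels, num_point, _ = sampling_loc.shape
    value_list = value.split([int(h * w) for h, w in spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_loc - 1  # grid_sample expects [-1, 1] coords
    sampled = []
    for lvl, (h_l, w_l) in enumerate(spatial_shapes):
        # (bs, H*W, heads, ch) -> (bs*heads, ch, H, W)
        v = (value_list[lvl].flatten(2).transpose(1, 2)
             .reshape(bs * num_heads, channels, int(h_l), int(w_l)))
        # (bs, nq, heads, points, 2) -> (bs*heads, nq, points, 2)
        g = sampling_grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)
        # bilinear sampling with zero padding, as in the CUDA bilinear helper
        sampled.append(F.grid_sample(v, g, mode="bilinear",
                                     padding_mode="zeros", align_corners=False))
    # (bs*heads, ch, nq, levels, points) -> (bs*heads, ch, nq, levels*points)
    out = torch.stack(sampled, dim=-2).flatten(-2)
    w = attn_weight.transpose(1, 2).reshape(bs * num_heads, 1,
                                            num_query, num_levels * num_point)
    # weighted sum over all sampling points, then back to (bs, nq, heads*ch)
    return ((out * w).sum(-1)
            .view(bs, num_heads * channels, num_query)
            .transpose(1, 2).contiguous())
```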
deformable_detr/ms_deform_attn_cuda.cuh DELETED
@@ -1,1467 +0,0 @@
- /*!
- **************************************************************************************************
- * Deformable DETR
- * Copyright (c) 2020 SenseTime. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
- **************************************************************************************************
- * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
- **************************************************************************************************
- */
-
- #include <vector>
-
- #include <cuda.h>
- #include <cuda_runtime.h>
-
- #include <cstdio>
- #include <algorithm>
- #include <cstring>
-
- #include <ATen/ATen.h>
- #include <ATen/cuda/CUDAContext.h>
-
- #include <THC/THCAtomics.cuh>
-
- #define CUDA_KERNEL_LOOP(i, n)                          \
-     for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-          i < (n);                                       \
-          i += blockDim.x * gridDim.x)
-
-
- at::Tensor ms_deform_attn_cuda_forward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const int im2col_step)
- {
-     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
-     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
-     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
-     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
-     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
-
-     AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
-     AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
-     AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
-     AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
-     AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
-
-     const int batch = value.size(0);
-     const int spatial_size = value.size(1);
-     const int num_heads = value.size(2);
-     const int channels = value.size(3);
-
-     const int num_levels = spatial_shapes.size(0);
-
-     const int num_query = sampling_loc.size(1);
-     const int num_point = sampling_loc.size(4);
-
-     const int im2col_step_ = std::min(batch, im2col_step);
-
-     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
-
-     auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
-
-     const int batch_n = im2col_step_;
-     auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
-     auto per_value_size = spatial_size * num_heads * channels;
-     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
-     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
-     for (int n = 0; n < batch/im2col_step_; ++n)
-     {
-         auto columns = output_n.select(0, n);
-         AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
-             ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
-                 value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
-                 spatial_shapes.data_ptr<int64_t>(),
-                 level_start_index.data_ptr<int64_t>(),
-                 sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
-                 attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
-                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
-                 columns.data_ptr<scalar_t>());
-
-         }));
-     }
-
-     output = output.view({batch, num_query, num_heads*channels});
-
-     return output;
- }
-
-
- std::vector<at::Tensor> ms_deform_attn_cuda_backward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const at::Tensor &grad_output,
-     const int im2col_step)
- {
-
-     AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
-     AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
-     AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
-     AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
-     AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
-     AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
-
-     AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
-     AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
-     AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
-     AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
-     AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
-     AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
-
-     const int batch = value.size(0);
-     const int spatial_size = value.size(1);
-     const int num_heads = value.size(2);
-     const int channels = value.size(3);
-
-     const int num_levels = spatial_shapes.size(0);
-
-     const int num_query = sampling_loc.size(1);
-     const int num_point = sampling_loc.size(4);
-
-     const int im2col_step_ = std::min(batch, im2col_step);
-
-     AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
-
-     auto grad_value = at::zeros_like(value);
-     auto grad_sampling_loc = at::zeros_like(sampling_loc);
-     auto grad_attn_weight = at::zeros_like(attn_weight);
-
-     const int batch_n = im2col_step_;
-     auto per_value_size = spatial_size * num_heads * channels;
-     auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
-     auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
-     auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
-
-     for (int n = 0; n < batch/im2col_step_; ++n)
-     {
-         auto grad_output_g = grad_output_n.select(0, n);
-         AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
-             ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
-                 grad_output_g.data_ptr<scalar_t>(),
-                 value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
-                 spatial_shapes.data_ptr<int64_t>(),
-                 level_start_index.data_ptr<int64_t>(),
-                 sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
-                 attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
-                 batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
-                 grad_value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
-                 grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
-                 grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
-
-         }));
-     }
-
-     return {
-         grad_value, grad_sampling_loc, grad_attn_weight
-     };
- }
-
- const int CUDA_NUM_THREADS = 1024;
- inline int GET_BLOCKS(const int N, const int num_threads)
- {
-     return (N + num_threads - 1) / num_threads;
- }
-
-
- template <typename scalar_t>
- __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
-     const int &height, const int &width, const int &nheads, const int &channels,
-     const scalar_t &h, const scalar_t &w, const int &m, const int &c)
- {
-     const int h_low = floor(h);
-     const int w_low = floor(w);
-     const int h_high = h_low + 1;
-     const int w_high = w_low + 1;
-
-     const scalar_t lh = h - h_low;
-     const scalar_t lw = w - w_low;
-     const scalar_t hh = 1 - lh, hw = 1 - lw;
-
-     const int w_stride = nheads * channels;
-     const int h_stride = width * w_stride;
-     const int h_low_ptr_offset = h_low * h_stride;
-     const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-     const int w_low_ptr_offset = w_low * w_stride;
-     const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-     const int base_ptr = m * channels + c;
-
-     scalar_t v1 = 0;
-     if (h_low >= 0 && w_low >= 0)
-     {
-         const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-         v1 = bottom_data[ptr1];
-     }
-     scalar_t v2 = 0;
-     if (h_low >= 0 && w_high <= width - 1)
-     {
-         const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-         v2 = bottom_data[ptr2];
-     }
-     scalar_t v3 = 0;
-     if (h_high <= height - 1 && w_low >= 0)
-     {
-         const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-         v3 = bottom_data[ptr3];
-     }
-     scalar_t v4 = 0;
-     if (h_high <= height - 1 && w_high <= width - 1)
-     {
-         const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-         v4 = bottom_data[ptr4];
-     }
-
-     const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-
-     const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-     return val;
- }
-
-
- template <typename scalar_t>
- __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
-     const int &height, const int &width, const int &nheads, const int &channels,
-     const scalar_t &h, const scalar_t &w, const int &m, const int &c,
-     const scalar_t &top_grad,
-     const scalar_t &attn_weight,
-     scalar_t* &grad_value,
-     scalar_t* grad_sampling_loc,
-     scalar_t* grad_attn_weight)
- {
-     const int h_low = floor(h);
-     const int w_low = floor(w);
-     const int h_high = h_low + 1;
-     const int w_high = w_low + 1;
-
-     const scalar_t lh = h - h_low;
-     const scalar_t lw = w - w_low;
-     const scalar_t hh = 1 - lh, hw = 1 - lw;
-
-     const int w_stride = nheads * channels;
-     const int h_stride = width * w_stride;
-     const int h_low_ptr_offset = h_low * h_stride;
-     const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-     const int w_low_ptr_offset = w_low * w_stride;
-     const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-     const int base_ptr = m * channels + c;
-
-     const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-     const scalar_t top_grad_value = top_grad * attn_weight;
-     scalar_t grad_h_weight = 0, grad_w_weight = 0;
-
-     scalar_t v1 = 0;
-     if (h_low >= 0 && w_low >= 0)
-     {
-         const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-         v1 = bottom_data[ptr1];
-         grad_h_weight -= hw * v1;
-         grad_w_weight -= hh * v1;
-         atomicAdd(grad_value+ptr1, w1*top_grad_value);
-     }
-     scalar_t v2 = 0;
-     if (h_low >= 0 && w_high <= width - 1)
-     {
-         const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-         v2 = bottom_data[ptr2];
-         grad_h_weight -= lw * v2;
-         grad_w_weight += hh * v2;
-         atomicAdd(grad_value+ptr2, w2*top_grad_value);
-     }
-     scalar_t v3 = 0;
-     if (h_high <= height - 1 && w_low >= 0)
-     {
-         const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-         v3 = bottom_data[ptr3];
-         grad_h_weight += hw * v3;
-         grad_w_weight -= lh * v3;
-         atomicAdd(grad_value+ptr3, w3*top_grad_value);
-     }
-     scalar_t v4 = 0;
-     if (h_high <= height - 1 && w_high <= width - 1)
-     {
-         const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-         v4 = bottom_data[ptr4];
-         grad_h_weight += lw * v4;
-         grad_w_weight += lh * v4;
-         atomicAdd(grad_value+ptr4, w4*top_grad_value);
-     }
-
-     const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-     *grad_attn_weight = top_grad * val;
-     *grad_sampling_loc = width * grad_w_weight * top_grad_value;
-     *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
- }
-
-
- template <typename scalar_t>
- __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
-     const int &height, const int &width, const int &nheads, const int &channels,
-     const scalar_t &h, const scalar_t &w, const int &m, const int &c,
-     const scalar_t &top_grad,
-     const scalar_t &attn_weight,
-     scalar_t* &grad_value,
-     scalar_t* grad_sampling_loc,
-     scalar_t* grad_attn_weight)
- {
-     const int h_low = floor(h);
-     const int w_low = floor(w);
-     const int h_high = h_low + 1;
-     const int w_high = w_low + 1;
-
-     const scalar_t lh = h - h_low;
-     const scalar_t lw = w - w_low;
-     const scalar_t hh = 1 - lh, hw = 1 - lw;
-
-     const int w_stride = nheads * channels;
-     const int h_stride = width * w_stride;
-     const int h_low_ptr_offset = h_low * h_stride;
-     const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-     const int w_low_ptr_offset = w_low * w_stride;
-     const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-     const int base_ptr = m * channels + c;
-
-     const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-     const scalar_t top_grad_value = top_grad * attn_weight;
-     scalar_t grad_h_weight = 0, grad_w_weight = 0;
-
-     scalar_t v1 = 0;
-     if (h_low >= 0 && w_low >= 0)
-     {
-         const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-         v1 = bottom_data[ptr1];
-         grad_h_weight -= hw * v1;
-         grad_w_weight -= hh * v1;
-         atomicAdd(grad_value+ptr1, w1*top_grad_value);
-     }
-     scalar_t v2 = 0;
-     if (h_low >= 0 && w_high <= width - 1)
-     {
-         const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-         v2 = bottom_data[ptr2];
-         grad_h_weight -= lw * v2;
-         grad_w_weight += hh * v2;
-         atomicAdd(grad_value+ptr2, w2*top_grad_value);
-     }
-     scalar_t v3 = 0;
-     if (h_high <= height - 1 && w_low >= 0)
-     {
-         const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-         v3 = bottom_data[ptr3];
-         grad_h_weight += hw * v3;
-         grad_w_weight -= lh * v3;
-         atomicAdd(grad_value+ptr3, w3*top_grad_value);
-     }
-     scalar_t v4 = 0;
-     if (h_high <= height - 1 && w_high <= width - 1)
-     {
-         const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-         v4 = bottom_data[ptr4];
-         grad_h_weight += lw * v4;
-         grad_w_weight += lh * v4;
-         atomicAdd(grad_value+ptr4, w4*top_grad_value);
-     }
-
-     const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-     atomicAdd(grad_attn_weight, top_grad * val);
-     atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
-     atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
- }
-
-
- template <typename scalar_t>
- __global__ void ms_deformable_im2col_gpu_kernel(const int n,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *data_col)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         scalar_t *data_col_ptr = data_col + index;
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-         scalar_t col = 0;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
-                 }
-
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-             }
-         }
-         *data_col_ptr = col;
-     }
- }
-
- template <typename scalar_t, unsigned int blockSize>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
-     const scalar_t *grad_col,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *grad_value,
-     scalar_t *grad_sampling_loc,
-     scalar_t *grad_attn_weight)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
-         __shared__ scalar_t cache_grad_attn_weight[blockSize];
-         unsigned int tid = threadIdx.x;
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         const scalar_t top_grad = grad_col[index];
-
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int grad_sampling_ptr = data_weight_ptr;
-         grad_sampling_loc += grad_sampling_ptr << 1;
-         grad_attn_weight += grad_sampling_ptr;
-         const int grad_weight_stride = 1;
-         const int grad_loc_stride = 2;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
-             const scalar_t *data_value_ptr = data_value + value_ptr_offset;
-             scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-                 *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
-                 *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
-                 *(cache_grad_attn_weight+threadIdx.x)=0;
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     ms_deform_attn_col2im_bilinear(
-                         data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
-                         top_grad, weight, grad_value_ptr,
-                         cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
-                 }
-
-                 __syncthreads();
-                 if (tid == 0)
-                 {
-                     scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
-                     int sid=2;
-                     for (unsigned int tid = 1; tid < blockSize; ++tid)
-                     {
-                         _grad_w += cache_grad_sampling_loc[sid];
-                         _grad_h += cache_grad_sampling_loc[sid + 1];
-                         _grad_a += cache_grad_attn_weight[tid];
-                         sid += 2;
-                     }
-
-
-                     *grad_sampling_loc = _grad_w;
-                     *(grad_sampling_loc + 1) = _grad_h;
-                     *grad_attn_weight = _grad_a;
-                 }
-                 __syncthreads();
-
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-                 grad_attn_weight += grad_weight_stride;
-                 grad_sampling_loc += grad_loc_stride;
-             }
-         }
-     }
- }
-
-
- template <typename scalar_t, unsigned int blockSize>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
-     const scalar_t *grad_col,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *grad_value,
-     scalar_t *grad_sampling_loc,
-     scalar_t *grad_attn_weight)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
-         __shared__ scalar_t cache_grad_attn_weight[blockSize];
-         unsigned int tid = threadIdx.x;
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         const scalar_t top_grad = grad_col[index];
-
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int grad_sampling_ptr = data_weight_ptr;
-         grad_sampling_loc += grad_sampling_ptr << 1;
-         grad_attn_weight += grad_sampling_ptr;
-         const int grad_weight_stride = 1;
-         const int grad_loc_stride = 2;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
-             const scalar_t *data_value_ptr = data_value + value_ptr_offset;
-             scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-                 *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
-                 *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
-                 *(cache_grad_attn_weight+threadIdx.x)=0;
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     ms_deform_attn_col2im_bilinear(
-                         data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
-                         top_grad, weight, grad_value_ptr,
-                         cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
-                 }
-
-                 __syncthreads();
-
-                 for (unsigned int s=blockSize/2; s>0; s>>=1)
-                 {
-                     if (tid < s) {
-                         const unsigned int xid1 = tid << 1;
-                         const unsigned int xid2 = (tid + s) << 1;
-                         cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-                         cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-                         cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
-                     }
-                     __syncthreads();
-                 }
-
-                 if (tid == 0)
-                 {
-                     *grad_sampling_loc = cache_grad_sampling_loc[0];
-                     *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-                     *grad_attn_weight = cache_grad_attn_weight[0];
-                 }
-                 __syncthreads();
-
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-                 grad_attn_weight += grad_weight_stride;
-                 grad_sampling_loc += grad_loc_stride;
-             }
-         }
-     }
- }
-
-
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
-     const scalar_t *grad_col,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *grad_value,
-     scalar_t *grad_sampling_loc,
-     scalar_t *grad_attn_weight)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         extern __shared__ int _s[];
-         scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
-         scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-         unsigned int tid = threadIdx.x;
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         const scalar_t top_grad = grad_col[index];
-
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int grad_sampling_ptr = data_weight_ptr;
-         grad_sampling_loc += grad_sampling_ptr << 1;
-         grad_attn_weight += grad_sampling_ptr;
-         const int grad_weight_stride = 1;
-         const int grad_loc_stride = 2;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
-             const scalar_t *data_value_ptr = data_value + value_ptr_offset;
-             scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-                 *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
-                 *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
-                 *(cache_grad_attn_weight+threadIdx.x)=0;
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     ms_deform_attn_col2im_bilinear(
-                         data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
-                         top_grad, weight, grad_value_ptr,
-                         cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
-                 }
-
-                 __syncthreads();
-                 if (tid == 0)
-                 {
-                     scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
-                     int sid=2;
-                     for (unsigned int tid = 1; tid < blockDim.x; ++tid)
-                     {
-                         _grad_w += cache_grad_sampling_loc[sid];
-                         _grad_h += cache_grad_sampling_loc[sid + 1];
-                         _grad_a += cache_grad_attn_weight[tid];
-                         sid += 2;
-                     }
-
-
-                     *grad_sampling_loc = _grad_w;
-                     *(grad_sampling_loc + 1) = _grad_h;
-                     *grad_attn_weight = _grad_a;
-                 }
-                 __syncthreads();
-
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-                 grad_attn_weight += grad_weight_stride;
-                 grad_sampling_loc += grad_loc_stride;
-             }
-         }
-     }
- }
-
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
-     const scalar_t *grad_col,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *grad_value,
-     scalar_t *grad_sampling_loc,
-     scalar_t *grad_attn_weight)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         extern __shared__ int _s[];
-         scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
-         scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-         unsigned int tid = threadIdx.x;
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         const scalar_t top_grad = grad_col[index];
-
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int grad_sampling_ptr = data_weight_ptr;
-         grad_sampling_loc += grad_sampling_ptr << 1;
-         grad_attn_weight += grad_sampling_ptr;
-         const int grad_weight_stride = 1;
-         const int grad_loc_stride = 2;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
-             const scalar_t *data_value_ptr = data_value + value_ptr_offset;
-             scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-                 *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
-                 *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
-                 *(cache_grad_attn_weight+threadIdx.x)=0;
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     ms_deform_attn_col2im_bilinear(
-                         data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
-                         top_grad, weight, grad_value_ptr,
-                         cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
-                 }
-
-                 __syncthreads();
-
-                 for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
-                 {
-                     if (tid < s) {
-                         const unsigned int xid1 = tid << 1;
-                         const unsigned int xid2 = (tid + s) << 1;
-                         cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-                         cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-                         cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
-                         if (tid + (s << 1) < spre)
-                         {
-                             cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
-                             cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
-                             cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-                         }
-                     }
-                     __syncthreads();
-                 }
-
-                 if (tid == 0)
-                 {
-                     *grad_sampling_loc = cache_grad_sampling_loc[0];
-                     *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-                     *grad_attn_weight = cache_grad_attn_weight[0];
-                 }
-                 __syncthreads();
-
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-                 grad_attn_weight += grad_weight_stride;
-                 grad_sampling_loc += grad_loc_stride;
-             }
-         }
-     }
- }
-
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
-     const scalar_t *grad_col,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *grad_value,
-     scalar_t *grad_sampling_loc,
-     scalar_t *grad_attn_weight)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         extern __shared__ int _s[];
-         scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
-         scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-         unsigned int tid = threadIdx.x;
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         const scalar_t top_grad = grad_col[index];
-
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int grad_sampling_ptr = data_weight_ptr;
-         grad_sampling_loc += grad_sampling_ptr << 1;
-         grad_attn_weight += grad_sampling_ptr;
-         const int grad_weight_stride = 1;
-         const int grad_loc_stride = 2;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
-             const scalar_t *data_value_ptr = data_value + value_ptr_offset;
-             scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-                 *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
-                 *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
-                 *(cache_grad_attn_weight+threadIdx.x)=0;
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     ms_deform_attn_col2im_bilinear(
-                         data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
-                         top_grad, weight, grad_value_ptr,
-                         cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
-                 }
-
-                 __syncthreads();
-
-                 for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
-                 {
-                     if (tid < s) {
-                         const unsigned int xid1 = tid << 1;
-                         const unsigned int xid2 = (tid + s) << 1;
-                         cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-                         cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-                         cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
-                         if (tid + (s << 1) < spre)
-                         {
-                             cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
-                             cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
-                             cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-                         }
-                     }
-                     __syncthreads();
-                 }
-
-                 if (tid == 0)
-                 {
-                     atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
-                     atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
-                     atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
-                 }
-                 __syncthreads();
-
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-                 grad_attn_weight += grad_weight_stride;
-                 grad_sampling_loc += grad_loc_stride;
-             }
-         }
-     }
- }
-
-
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
-     const scalar_t *grad_col,
-     const scalar_t *data_value,
-     const int64_t *data_spatial_shapes,
-     const int64_t *data_level_start_index,
-     const scalar_t *data_sampling_loc,
-     const scalar_t *data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t *grad_value,
-     scalar_t *grad_sampling_loc,
-     scalar_t *grad_attn_weight)
- {
-     CUDA_KERNEL_LOOP(index, n)
-     {
-         int _temp = index;
-         const int c_col = _temp % channels;
-         _temp /= channels;
-         const int sampling_index = _temp;
-         const int m_col = _temp % num_heads;
-         _temp /= num_heads;
-         [[maybe_unused]] const int q_col = _temp % num_query;
-         _temp /= num_query;
-         const int b_col = _temp;
-
-         const scalar_t top_grad = grad_col[index];
-
-         int data_weight_ptr = sampling_index * num_levels * num_point;
-         int data_loc_w_ptr = data_weight_ptr << 1;
-         const int grad_sampling_ptr = data_weight_ptr;
-         grad_sampling_loc += grad_sampling_ptr << 1;
-         grad_attn_weight += grad_sampling_ptr;
-         const int grad_weight_stride = 1;
-         const int grad_loc_stride = 2;
-         const int qid_stride = num_heads * channels;
-         const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
-
-         for (int l_col=0; l_col < num_levels; ++l_col)
-         {
-             const int level_start_id = data_level_start_index[l_col];
-             const int spatial_h_ptr = l_col << 1;
-             const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-             const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-             const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
-             const scalar_t *data_value_ptr = data_value + value_ptr_offset;
-             scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-             for (int p_col=0; p_col < num_point; ++p_col)
-             {
-                 const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
-                 const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-                 const scalar_t weight = data_attn_weight[data_weight_ptr];
-
-                 const scalar_t h_im = loc_h * spatial_h - 0.5;
-                 const scalar_t w_im = loc_w * spatial_w - 0.5;
-                 if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
-                 {
-                     ms_deform_attn_col2im_bilinear_gm(
-                         data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
-                         top_grad, weight, grad_value_ptr,
-                         grad_sampling_loc, grad_attn_weight);
-                 }
-                 data_weight_ptr += 1;
-                 data_loc_w_ptr += 2;
-                 grad_attn_weight += grad_weight_stride;
-                 grad_sampling_loc += grad_loc_stride;
-             }
-         }
-     }
- }
-
-
- template <typename scalar_t>
- void ms_deformable_im2col_cuda(cudaStream_t stream,
-     const scalar_t* data_value,
-     const int64_t* data_spatial_shapes,
-     const int64_t* data_level_start_index,
-     const scalar_t* data_sampling_loc,
-     const scalar_t* data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t* data_col)
- {
-     const int num_kernels = batch_size * num_query * num_heads * channels;
-     const int num_actual_kernels = batch_size * num_query * num_heads * channels;
-     const int num_threads = CUDA_NUM_THREADS;
-     ms_deformable_im2col_gpu_kernel<scalar_t>
-         <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-             0, stream>>>(
-         num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
-         batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
-
-     cudaError_t err = cudaGetLastError();
-     if (err != cudaSuccess)
-     {
-         printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
-     }
-
- }
-
- template <typename scalar_t>
- void ms_deformable_col2im_cuda(cudaStream_t stream,
-     const scalar_t* grad_col,
-     const scalar_t* data_value,
-     const int64_t * data_spatial_shapes,
-     const int64_t * data_level_start_index,
-     const scalar_t * data_sampling_loc,
-     const scalar_t * data_attn_weight,
-     const int batch_size,
-     const int spatial_size,
-     const int num_heads,
-     const int channels,
-     const int num_levels,
-     const int num_query,
-     const int num_point,
-     scalar_t* grad_value,
-     scalar_t* grad_sampling_loc,
-     scalar_t* grad_attn_weight)
- {
-     const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
-     const int num_kernels = batch_size * num_query * num_heads * channels;
-     const int num_actual_kernels = batch_size * num_query * num_heads * channels;
-     if (channels > 1024)
-     {
-         if ((channels & 1023) == 0)
-         {
-             ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
-                 <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                     num_threads*3*sizeof(scalar_t), stream>>>(
-                 num_kernels,
-                 grad_col,
-                 data_value,
-                 data_spatial_shapes,
-                 data_level_start_index,
-                 data_sampling_loc,
-                 data_attn_weight,
-                 batch_size,
-                 spatial_size,
-                 num_heads,
-                 channels,
-                 num_levels,
-                 num_query,
-                 num_point,
-                 grad_value,
-                 grad_sampling_loc,
-                 grad_attn_weight);
-         }
-         else
-         {
-             ms_deformable_col2im_gpu_kernel_gm<scalar_t>
-                 <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                     0, stream>>>(
-                 num_kernels,
-                 grad_col,
-                 data_value,
-                 data_spatial_shapes,
-                 data_level_start_index,
-                 data_sampling_loc,
-                 data_attn_weight,
-                 batch_size,
-                 spatial_size,
-                 num_heads,
-                 channels,
-                 num_levels,
-                 num_query,
-                 num_point,
-                 grad_value,
-                 grad_sampling_loc,
-                 grad_attn_weight);
-         }
-     }
-     else{
-         switch(channels)
-         {
-             case 1:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 2:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 4:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 8:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 16:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 32:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 64:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 128:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 256:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 512:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             case 1024:
-                 ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
-                     <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                         0, stream>>>(
-                     num_kernels,
-                     grad_col,
-                     data_value,
-                     data_spatial_shapes,
-                     data_level_start_index,
-                     data_sampling_loc,
-                     data_attn_weight,
-                     batch_size,
-                     spatial_size,
-                     num_heads,
-                     channels,
-                     num_levels,
-                     num_query,
-                     num_point,
-                     grad_value,
-                     grad_sampling_loc,
-                     grad_attn_weight);
-                 break;
-             default:
-                 if (channels < 64)
-                 {
-                     ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
-                         <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-                             num_threads*3*sizeof(scalar_t), stream>>>(
-                         num_kernels,
-                         grad_col,
-                         data_value,
-                         data_spatial_shapes,
-                         data_level_start_index,
1423
- data_sampling_loc,
1424
- data_attn_weight,
1425
- batch_size,
1426
- spatial_size,
1427
- num_heads,
1428
- channels,
1429
- num_levels,
1430
- num_query,
1431
- num_point,
1432
- grad_value,
1433
- grad_sampling_loc,
1434
- grad_attn_weight);
1435
- }
1436
- else
1437
- {
1438
- ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
1439
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
1440
- num_threads*3*sizeof(scalar_t), stream>>>(
1441
- num_kernels,
1442
- grad_col,
1443
- data_value,
1444
- data_spatial_shapes,
1445
- data_level_start_index,
1446
- data_sampling_loc,
1447
- data_attn_weight,
1448
- batch_size,
1449
- spatial_size,
1450
- num_heads,
1451
- channels,
1452
- num_levels,
1453
- num_query,
1454
- num_point,
1455
- grad_value,
1456
- grad_sampling_loc,
1457
- grad_attn_weight);
1458
- }
1459
- }
1460
- }
1461
- cudaError_t err = cudaGetLastError();
1462
- if (err != cudaSuccess)
1463
- {
1464
- printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
1465
- }
1466
-
1467
- }
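The deleted CUDA sources above implement multi-scale deformable attention: bilinear sampling of a multi-level value map at predicted locations, weighted by per-point attention weights. For readers who want a reference point without the CUDA build, a pure-PyTorch sketch of the same forward computation follows. This is an illustration, not code from this repository: the function name is invented here, and `align_corners=False` is chosen on the assumption that it mirrors the kernels' `loc * spatial - 0.5` pixel-center convention.

```python
# Pure-PyTorch reference sketch for the forward pass of the deleted kernels.
import torch
import torch.nn.functional as F

def multi_scale_deformable_attention(value, value_spatial_shapes,
                                     sampling_locations, attention_weights):
    # value:                (bs, sum(H_l*W_l), num_heads, head_dim)
    # value_spatial_shapes: (num_levels, 2) rows of (H_l, W_l)
    # sampling_locations:   (bs, num_queries, num_heads, num_levels, num_points, 2) in [0, 1]
    # attention_weights:    (bs, num_queries, num_heads, num_levels, num_points)
    bs, _, num_heads, head_dim = value.shape
    _, num_queries, _, num_levels, num_points, _ = sampling_locations.shape
    value_list = value.split([int(h) * int(w) for h, w in value_spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1  # grid_sample expects [-1, 1]
    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # (bs, H*W, heads, dim) -> (bs*heads, dim, H, W)
        value_l = (value_list[level].flatten(2).transpose(1, 2)
                   .reshape(bs * num_heads, head_dim, int(h), int(w)))
        # (bs, queries, heads, points, 2) -> (bs*heads, queries, points, 2)
        grid_l = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # Out-of-range points read zeros, like the boundary checks in the kernels.
        sampled.append(F.grid_sample(value_l, grid_l, mode="bilinear",
                                     padding_mode="zeros", align_corners=False))
    # (bs*heads, 1, queries, levels*points)
    attention_weights = attention_weights.transpose(1, 2).reshape(
        bs * num_heads, 1, num_queries, num_levels * num_points)
    out = (torch.stack(sampled, dim=-2).flatten(-2) * attention_weights).sum(-1)
    return out.view(bs, num_heads * head_dim, num_queries).transpose(1, 2).contiguous()
```

Unlike the fused kernels, this version materializes per-level sampled tensors and leaves the backward pass to autograd, so it serves as a correctness reference rather than a performance substitute.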
deformable_detr/ms_deform_attn_cuda.h DELETED
@@ -1,46 +0,0 @@
- /*!
- **************************************************************************************************
- * Deformable DETR
- * Copyright (c) 2020 SenseTime. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
- **************************************************************************************************
- * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
- **************************************************************************************************
- */
- 
- #pragma once
- #include <torch/torch.h>
- 
- at::Tensor ms_deform_attn_cuda_forward(
- const at::Tensor &value,
- const at::Tensor &spatial_shapes,
- const at::Tensor &level_start_index,
- const at::Tensor &sampling_loc,
- const at::Tensor &attn_weight,
- const int im2col_step);
- 
- at::Tensor ms_deform_attn_cuda_forward_bf16(
- const at::Tensor &value,
- const at::Tensor &spatial_shapes,
- const at::Tensor &level_start_index,
- const at::Tensor &sampling_loc,
- const at::Tensor &attn_weight,
- const int im2col_step);
- 
- std::vector<at::Tensor> ms_deform_attn_cuda_backward(
- const at::Tensor &value,
- const at::Tensor &spatial_shapes,
- const at::Tensor &level_start_index,
- const at::Tensor &sampling_loc,
- const at::Tensor &attn_weight,
- const at::Tensor &grad_output,
- const int im2col_step);
- 
- std::vector<at::Tensor> ms_deform_attn_cuda_backward_bf16(
- const at::Tensor &value,
- const at::Tensor &spatial_shapes,
- const at::Tensor &level_start_index,
- const at::Tensor &sampling_loc,
- const at::Tensor &attn_weight,
- const at::Tensor &grad_output,
- const int im2col_step);
deformable_detr/ms_deform_im2col_cuda.cuh DELETED
@@ -1,1327 +0,0 @@
- /*!
- **************************************************************************
- * Deformable DETR
- * Copyright (c) 2020 SenseTime. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
- **************************************************************************
- * Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
- * Copyright (c) 2018 Microsoft
- **************************************************************************
- */
- 
- #include <cstdio>
- #include <algorithm>
- #include <cstring>
- 
- #include <ATen/ATen.h>
- #include <ATen/cuda/CUDAContext.h>
- 
- #include <THC/THCAtomics.cuh>
- 
- #define CUDA_KERNEL_LOOP(i, n) \
- for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
- i < (n); \
- i += blockDim.x * gridDim.x)
- 
- const int CUDA_NUM_THREADS = 1024;
- inline int GET_BLOCKS(const int N, const int num_threads)
- {
- return (N + num_threads - 1) / num_threads;
- }
- 
- 
- template <typename scalar_t>
- __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
- const int &height, const int &width, const int &nheads, const int &channels,
- const scalar_t &h, const scalar_t &w, const int &m, const int &c)
- {
- const int h_low = floor(h);
- const int w_low = floor(w);
- const int h_high = h_low + 1;
- const int w_high = w_low + 1;
- 
- const scalar_t lh = h - h_low;
- const scalar_t lw = w - w_low;
- const scalar_t hh = 1 - lh, hw = 1 - lw;
- 
- const int w_stride = nheads * channels;
- const int h_stride = width * w_stride;
- const int h_low_ptr_offset = h_low * h_stride;
- const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
- const int w_low_ptr_offset = w_low * w_stride;
- const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
- const int base_ptr = m * channels + c;
- 
- scalar_t v1 = 0;
- if (h_low >= 0 && w_low >= 0)
- {
- const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
- v1 = bottom_data[ptr1];
- }
- scalar_t v2 = 0;
- if (h_low >= 0 && w_high <= width - 1)
- {
- const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
- v2 = bottom_data[ptr2];
- }
- scalar_t v3 = 0;
- if (h_high <= height - 1 && w_low >= 0)
- {
- const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
- v3 = bottom_data[ptr3];
- }
- scalar_t v4 = 0;
- if (h_high <= height - 1 && w_high <= width - 1)
- {
- const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
- v4 = bottom_data[ptr4];
- }
- 
- const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
- 
- const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
- return val;
- }
- 
- 
- template <typename scalar_t>
- __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
- const int &height, const int &width, const int &nheads, const int &channels,
- const scalar_t &h, const scalar_t &w, const int &m, const int &c,
- const scalar_t &top_grad,
- const scalar_t &attn_weight,
- scalar_t* &grad_value,
- scalar_t* grad_sampling_loc,
- scalar_t* grad_attn_weight)
- {
- const int h_low = floor(h);
- const int w_low = floor(w);
- const int h_high = h_low + 1;
- const int w_high = w_low + 1;
- 
- const scalar_t lh = h - h_low;
- const scalar_t lw = w - w_low;
- const scalar_t hh = 1 - lh, hw = 1 - lw;
- 
- const int w_stride = nheads * channels;
- const int h_stride = width * w_stride;
- const int h_low_ptr_offset = h_low * h_stride;
- const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
- const int w_low_ptr_offset = w_low * w_stride;
- const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
- const int base_ptr = m * channels + c;
- 
- const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
- const scalar_t top_grad_value = top_grad * attn_weight;
- scalar_t grad_h_weight = 0, grad_w_weight = 0;
- 
- scalar_t v1 = 0;
- if (h_low >= 0 && w_low >= 0)
- {
- const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
- v1 = bottom_data[ptr1];
- grad_h_weight -= hw * v1;
- grad_w_weight -= hh * v1;
- atomicAdd(grad_value+ptr1, w1*top_grad_value);
- }
- scalar_t v2 = 0;
- if (h_low >= 0 && w_high <= width - 1)
- {
- const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
- v2 = bottom_data[ptr2];
- grad_h_weight -= lw * v2;
- grad_w_weight += hh * v2;
- atomicAdd(grad_value+ptr2, w2*top_grad_value);
- }
- scalar_t v3 = 0;
- if (h_high <= height - 1 && w_low >= 0)
- {
- const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
- v3 = bottom_data[ptr3];
- grad_h_weight += hw * v3;
- grad_w_weight -= lh * v3;
- atomicAdd(grad_value+ptr3, w3*top_grad_value);
- }
- scalar_t v4 = 0;
- if (h_high <= height - 1 && w_high <= width - 1)
- {
- const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
- v4 = bottom_data[ptr4];
- grad_h_weight += lw * v4;
- grad_w_weight += lh * v4;
- atomicAdd(grad_value+ptr4, w4*top_grad_value);
- }
- 
- const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
- *grad_attn_weight = top_grad * val;
- *grad_sampling_loc = width * grad_w_weight * top_grad_value;
- *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
- }
- 
- 
- template <typename scalar_t>
- __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
- const int &height, const int &width, const int &nheads, const int &channels,
- const scalar_t &h, const scalar_t &w, const int &m, const int &c,
- const scalar_t &top_grad,
- const scalar_t &attn_weight,
- scalar_t* &grad_value,
- scalar_t* grad_sampling_loc,
- scalar_t* grad_attn_weight)
- {
- const int h_low = floor(h);
- const int w_low = floor(w);
- const int h_high = h_low + 1;
- const int w_high = w_low + 1;
- 
- const scalar_t lh = h - h_low;
- const scalar_t lw = w - w_low;
- const scalar_t hh = 1 - lh, hw = 1 - lw;
- 
- const int w_stride = nheads * channels;
- const int h_stride = width * w_stride;
- const int h_low_ptr_offset = h_low * h_stride;
- const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
- const int w_low_ptr_offset = w_low * w_stride;
- const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
- const int base_ptr = m * channels + c;
- 
- const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
- const scalar_t top_grad_value = top_grad * attn_weight;
- scalar_t grad_h_weight = 0, grad_w_weight = 0;
- 
- scalar_t v1 = 0;
- if (h_low >= 0 && w_low >= 0)
- {
- const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
- v1 = bottom_data[ptr1];
- grad_h_weight -= hw * v1;
- grad_w_weight -= hh * v1;
- atomicAdd(grad_value+ptr1, w1*top_grad_value);
- }
- scalar_t v2 = 0;
- if (h_low >= 0 && w_high <= width - 1)
- {
- const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
- v2 = bottom_data[ptr2];
- grad_h_weight -= lw * v2;
- grad_w_weight += hh * v2;
- atomicAdd(grad_value+ptr2, w2*top_grad_value);
- }
- scalar_t v3 = 0;
- if (h_high <= height - 1 && w_low >= 0)
- {
- const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
- v3 = bottom_data[ptr3];
- grad_h_weight += hw * v3;
- grad_w_weight -= lh * v3;
- atomicAdd(grad_value+ptr3, w3*top_grad_value);
- }
- scalar_t v4 = 0;
- if (h_high <= height - 1 && w_high <= width - 1)
- {
- const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
- v4 = bottom_data[ptr4];
- grad_h_weight += lw * v4;
- grad_w_weight += lh * v4;
- atomicAdd(grad_value+ptr4, w4*top_grad_value);
- }
- 
- const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
- atomicAdd(grad_attn_weight, top_grad * val);
- atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
- atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
- }
- 
- 
- template <typename scalar_t>
- __global__ void ms_deformable_im2col_gpu_kernel(const int n,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *data_col)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- scalar_t *data_col_ptr = data_col + index;
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- scalar_t col = 0;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- 
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
- }
- 
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- }
- }
- *data_col_ptr = col;
- }
- }
- 
- template <typename scalar_t, unsigned int blockSize>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
- const scalar_t *grad_col,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *grad_value,
- scalar_t *grad_sampling_loc,
- scalar_t *grad_attn_weight)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
- __shared__ scalar_t cache_grad_attn_weight[blockSize];
- unsigned int tid = threadIdx.x;
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- const scalar_t top_grad = grad_col[index];
- 
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int grad_sampling_ptr = data_weight_ptr;
- grad_sampling_loc += grad_sampling_ptr << 1;
- grad_attn_weight += grad_sampling_ptr;
- const int grad_weight_stride = 1;
- const int grad_loc_stride = 2;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
- const scalar_t *data_value_ptr = data_value + value_ptr_offset;
- scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
- 
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
- *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
- *(cache_grad_attn_weight+threadIdx.x)=0;
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- ms_deform_attn_col2im_bilinear(
- data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
- top_grad, weight, grad_value_ptr,
- cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
- }
- 
- __syncthreads();
- if (tid == 0)
- {
- scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
- int sid=2;
- for (unsigned int tid = 1; tid < blockSize; ++tid)
- {
- _grad_w += cache_grad_sampling_loc[sid];
- _grad_h += cache_grad_sampling_loc[sid + 1];
- _grad_a += cache_grad_attn_weight[tid];
- sid += 2;
- }
- 
- 
- *grad_sampling_loc = _grad_w;
- *(grad_sampling_loc + 1) = _grad_h;
- *grad_attn_weight = _grad_a;
- }
- __syncthreads();
- 
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- grad_attn_weight += grad_weight_stride;
- grad_sampling_loc += grad_loc_stride;
- }
- }
- }
- }
- 
- 
- template <typename scalar_t, unsigned int blockSize>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
- const scalar_t *grad_col,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *grad_value,
- scalar_t *grad_sampling_loc,
- scalar_t *grad_attn_weight)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
- __shared__ scalar_t cache_grad_attn_weight[blockSize];
- unsigned int tid = threadIdx.x;
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- const scalar_t top_grad = grad_col[index];
- 
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int grad_sampling_ptr = data_weight_ptr;
- grad_sampling_loc += grad_sampling_ptr << 1;
- grad_attn_weight += grad_sampling_ptr;
- const int grad_weight_stride = 1;
- const int grad_loc_stride = 2;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
- const scalar_t *data_value_ptr = data_value + value_ptr_offset;
- scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
- 
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
- *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
- *(cache_grad_attn_weight+threadIdx.x)=0;
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- ms_deform_attn_col2im_bilinear(
- data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
- top_grad, weight, grad_value_ptr,
- cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
- }
- 
- __syncthreads();
- 
- for (unsigned int s=blockSize/2; s>0; s>>=1)
- {
- if (tid < s) {
- const unsigned int xid1 = tid << 1;
- const unsigned int xid2 = (tid + s) << 1;
- cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
- cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
- cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
- }
- __syncthreads();
- }
- 
- if (tid == 0)
- {
- *grad_sampling_loc = cache_grad_sampling_loc[0];
- *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
- *grad_attn_weight = cache_grad_attn_weight[0];
- }
- __syncthreads();
- 
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- grad_attn_weight += grad_weight_stride;
- grad_sampling_loc += grad_loc_stride;
- }
- }
- }
- }
- 
- 
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
- const scalar_t *grad_col,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *grad_value,
- scalar_t *grad_sampling_loc,
- scalar_t *grad_attn_weight)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- extern __shared__ int _s[];
- scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
- scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
- unsigned int tid = threadIdx.x;
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- const scalar_t top_grad = grad_col[index];
- 
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int grad_sampling_ptr = data_weight_ptr;
- grad_sampling_loc += grad_sampling_ptr << 1;
- grad_attn_weight += grad_sampling_ptr;
- const int grad_weight_stride = 1;
- const int grad_loc_stride = 2;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
- const scalar_t *data_value_ptr = data_value + value_ptr_offset;
- scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
- 
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
- *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
- *(cache_grad_attn_weight+threadIdx.x)=0;
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- ms_deform_attn_col2im_bilinear(
- data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
- top_grad, weight, grad_value_ptr,
- cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
- }
- 
- __syncthreads();
- if (tid == 0)
- {
- scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
- int sid=2;
- for (unsigned int tid = 1; tid < blockDim.x; ++tid)
- {
- _grad_w += cache_grad_sampling_loc[sid];
- _grad_h += cache_grad_sampling_loc[sid + 1];
- _grad_a += cache_grad_attn_weight[tid];
- sid += 2;
- }
- 
- 
- *grad_sampling_loc = _grad_w;
- *(grad_sampling_loc + 1) = _grad_h;
- *grad_attn_weight = _grad_a;
- }
- __syncthreads();
- 
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- grad_attn_weight += grad_weight_stride;
- grad_sampling_loc += grad_loc_stride;
- }
- }
- }
- }
- 
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
- const scalar_t *grad_col,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *grad_value,
- scalar_t *grad_sampling_loc,
- scalar_t *grad_attn_weight)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- extern __shared__ int _s[];
- scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
- scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
- unsigned int tid = threadIdx.x;
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- const scalar_t top_grad = grad_col[index];
- 
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int grad_sampling_ptr = data_weight_ptr;
- grad_sampling_loc += grad_sampling_ptr << 1;
- grad_attn_weight += grad_sampling_ptr;
- const int grad_weight_stride = 1;
- const int grad_loc_stride = 2;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
- const scalar_t *data_value_ptr = data_value + value_ptr_offset;
- scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
- 
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
- *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
- *(cache_grad_attn_weight+threadIdx.x)=0;
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- ms_deform_attn_col2im_bilinear(
- data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
- top_grad, weight, grad_value_ptr,
- cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
- }
- 
- __syncthreads();
- 
- for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
- {
- if (tid < s) {
- const unsigned int xid1 = tid << 1;
- const unsigned int xid2 = (tid + s) << 1;
- cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
- cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
- cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
- if (tid + (s << 1) < spre)
- {
- cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
- cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
- cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
- }
- }
- __syncthreads();
- }
- 
- if (tid == 0)
- {
- *grad_sampling_loc = cache_grad_sampling_loc[0];
- *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
- *grad_attn_weight = cache_grad_attn_weight[0];
- }
- __syncthreads();
- 
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- grad_attn_weight += grad_weight_stride;
- grad_sampling_loc += grad_loc_stride;
- }
- }
- }
- }
- 
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
- const scalar_t *grad_col,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *grad_value,
- scalar_t *grad_sampling_loc,
- scalar_t *grad_attn_weight)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- extern __shared__ int _s[];
- scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
- scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
- unsigned int tid = threadIdx.x;
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- const scalar_t top_grad = grad_col[index];
- 
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int grad_sampling_ptr = data_weight_ptr;
- grad_sampling_loc += grad_sampling_ptr << 1;
- grad_attn_weight += grad_sampling_ptr;
- const int grad_weight_stride = 1;
- const int grad_loc_stride = 2;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
- const scalar_t *data_value_ptr = data_value + value_ptr_offset;
- scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
- 
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
- *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
- *(cache_grad_attn_weight+threadIdx.x)=0;
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- ms_deform_attn_col2im_bilinear(
- data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
- top_grad, weight, grad_value_ptr,
- cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
- }
- 
- __syncthreads();
- 
- for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
- {
- if (tid < s) {
- const unsigned int xid1 = tid << 1;
- const unsigned int xid2 = (tid + s) << 1;
- cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
- cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
- cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
- if (tid + (s << 1) < spre)
- {
- cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
- cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
- cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
- }
- }
- __syncthreads();
- }
- 
- if (tid == 0)
- {
- atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
- atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
- atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
- }
- __syncthreads();
- 
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- grad_attn_weight += grad_weight_stride;
- grad_sampling_loc += grad_loc_stride;
- }
- }
- }
- }
- 
- 
- template <typename scalar_t>
- __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
- const scalar_t *grad_col,
- const scalar_t *data_value,
- const int64_t *data_spatial_shapes,
- const int64_t *data_level_start_index,
- const scalar_t *data_sampling_loc,
- const scalar_t *data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t *grad_value,
- scalar_t *grad_sampling_loc,
- scalar_t *grad_attn_weight)
- {
- CUDA_KERNEL_LOOP(index, n)
- {
- int _temp = index;
- const int c_col = _temp % channels;
- _temp /= channels;
- const int sampling_index = _temp;
- const int m_col = _temp % num_heads;
- _temp /= num_heads;
- [[maybe_unused]] const int q_col = _temp % num_query;
- _temp /= num_query;
- const int b_col = _temp;
- 
- const scalar_t top_grad = grad_col[index];
- 
- int data_weight_ptr = sampling_index * num_levels * num_point;
- int data_loc_w_ptr = data_weight_ptr << 1;
- const int grad_sampling_ptr = data_weight_ptr;
- grad_sampling_loc += grad_sampling_ptr << 1;
- grad_attn_weight += grad_sampling_ptr;
- const int grad_weight_stride = 1;
- const int grad_loc_stride = 2;
- const int qid_stride = num_heads * channels;
- const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
- 
- for (int l_col=0; l_col < num_levels; ++l_col)
- {
- const int level_start_id = data_level_start_index[l_col];
- const int spatial_h_ptr = l_col << 1;
- const int spatial_h = data_spatial_shapes[spatial_h_ptr];
- const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
- const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
- const scalar_t *data_value_ptr = data_value + value_ptr_offset;
- scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
- 
- for (int p_col=0; p_col < num_point; ++p_col)
- {
- const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
- const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
- const scalar_t weight = data_attn_weight[data_weight_ptr];
- 
- const scalar_t h_im = loc_h * spatial_h - 0.5;
- const scalar_t w_im = loc_w * spatial_w - 0.5;
- if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
- {
- ms_deform_attn_col2im_bilinear_gm(
- data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
- top_grad, weight, grad_value_ptr,
- grad_sampling_loc, grad_attn_weight);
- }
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- grad_attn_weight += grad_weight_stride;
- grad_sampling_loc += grad_loc_stride;
- }
- }
- }
- }
- 
- 
- template <typename scalar_t>
- void ms_deformable_im2col_cuda(cudaStream_t stream,
- const scalar_t* data_value,
- const int64_t* data_spatial_shapes,
- const int64_t* data_level_start_index,
- const scalar_t* data_sampling_loc,
- const scalar_t* data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t* data_col)
- {
- const int num_kernels = batch_size * num_query * num_heads * channels;
- const int num_actual_kernels = batch_size * num_query * num_heads * channels;
- const int num_threads = CUDA_NUM_THREADS;
- ms_deformable_im2col_gpu_kernel<scalar_t>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
- batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
- 
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess)
- {
- printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
- }
- 
- }
- 
- template <typename scalar_t>
- void ms_deformable_col2im_cuda(cudaStream_t stream,
- const scalar_t* grad_col,
- const scalar_t* data_value,
- const int64_t * data_spatial_shapes,
- const int64_t * data_level_start_index,
- const scalar_t * data_sampling_loc,
- const scalar_t * data_attn_weight,
- const int batch_size,
- const int spatial_size,
- const int num_heads,
- const int channels,
- const int num_levels,
- const int num_query,
- const int num_point,
- scalar_t* grad_value,
- scalar_t* grad_sampling_loc,
- scalar_t* grad_attn_weight)
- {
- const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
- const int num_kernels = batch_size * num_query * num_heads * channels;
- const int num_actual_kernels = batch_size * num_query * num_heads * channels;
- if (channels > 1024)
- {
- if ((channels & 1023) == 0)
- {
- ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- num_threads*3*sizeof(scalar_t), stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- }
- else
- {
- ms_deformable_col2im_gpu_kernel_gm<scalar_t>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- }
- }
- else{
- switch(channels)
- {
- case 1:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 2:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 4:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 8:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 16:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 32:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 64:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 128:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 256:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 512:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- case 1024:
- ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- 0, stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- break;
- default:
- if (channels < 64)
- {
- ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- num_threads*3*sizeof(scalar_t), stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- }
- else
- {
- ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
- <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
- num_threads*3*sizeof(scalar_t), stream>>>(
- num_kernels,
- grad_col,
- data_value,
- data_spatial_shapes,
- data_level_start_index,
- data_sampling_loc,
- data_attn_weight,
- batch_size,
- spatial_size,
- num_heads,
- channels,
- num_levels,
- num_query,
- num_point,
- grad_value,
- grad_sampling_loc,
- grad_attn_weight);
- }
- }
- }
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess)
- {
- printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
- }
- 
- }
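With the source removed here, the intended consumption path is the prebuilt binary on the Hub (see the kernel source link added to the README above). A minimal sketch using the `kernels` Python package follows; the exported op name `ms_deform_attn_forward` and its positional argument order are assumptions carried over from the deleted `ms_deform_attn_cuda.h` declaration, so verify them against the linked repository.

```python
# Hedged sketch: fetch the prebuilt kernel instead of compiling the deleted
# sources. Requires `pip install kernels torch` and a CUDA GPU.
import torch
from kernels import get_kernel

ops = get_kernel("kernels-community/deformable-detr")

batch, num_heads, channels = 2, 8, 32
num_levels, num_query, num_point = 2, 100, 4
# Two feature levels, 32x32 and 16x16, flattened into one value tensor.
shapes = torch.tensor([[32, 32], [16, 16]], dtype=torch.int64, device="cuda")
level_start_index = torch.tensor([0, 32 * 32], dtype=torch.int64, device="cuda")
spatial_size = int((shapes[:, 0] * shapes[:, 1]).sum())

value = torch.rand(batch, spatial_size, num_heads, channels, device="cuda")
sampling_loc = torch.rand(batch, num_query, num_heads, num_levels, num_point, 2, device="cuda")
attn_weight = torch.rand(batch, num_query, num_heads, num_levels, num_point, device="cuda")

# Assumed to mirror the deleted ms_deform_attn_cuda_forward signature,
# with im2col_step=64 as the final argument.
out = ops.ms_deform_attn_forward(
    value, shapes, level_start_index, sampling_loc, attn_weight, 64
)
print(out.shape)  # expected: (batch, num_query, num_heads * channels)
```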
flake.lock DELETED
@@ -1,169 +0,0 @@
- {
-   "nodes": {
-     "flake-compat": {
-       "locked": {
-         "lastModified": 1747046372,
-         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
-         "type": "github"
-       },
-       "original": {
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "type": "github"
-       }
-     },
-     "flake-compat_2": {
-       "locked": {
-         "lastModified": 1733328505,
-         "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
-         "type": "github"
-       },
-       "original": {
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "type": "github"
-       }
-     },
-     "flake-utils": {
-       "inputs": {
-         "systems": "systems"
-       },
-       "locked": {
-         "lastModified": 1731533236,
-         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "type": "github"
-       }
-     },
-     "flake-utils_2": {
-       "inputs": {
-         "systems": "systems_2"
-       },
-       "locked": {
-         "lastModified": 1731533236,
-         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "type": "github"
-       }
-     },
-     "hf-nix": {
-       "inputs": {
-         "flake-compat": "flake-compat_2",
-         "flake-utils": "flake-utils_2",
-         "nixpkgs": "nixpkgs"
-       },
-       "locked": {
-         "lastModified": 1753354560,
-         "narHash": "sha256-vmOfRmr0Qm/IbZTWB2sBn+UFrABSTTA/cTg+m27Yt/E=",
-         "owner": "huggingface",
-         "repo": "hf-nix",
-         "rev": "7f2aceda2a2e72cd573bdb25e5c0667fd75f89d3",
-         "type": "github"
-       },
-       "original": {
-         "owner": "huggingface",
-         "repo": "hf-nix",
-         "type": "github"
-       }
-     },
-     "kernel-builder": {
-       "inputs": {
-         "flake-compat": "flake-compat",
-         "flake-utils": "flake-utils",
-         "hf-nix": "hf-nix",
-         "nixpkgs": [
-           "kernel-builder",
-           "hf-nix",
-           "nixpkgs"
-         ]
-       },
-       "locked": {
-         "lastModified": 1753354632,
-         "narHash": "sha256-31SX3Raiyx0qCuY9JSlx9ZZgxljeUxvW+JdujjxbofQ=",
-         "owner": "huggingface",
-         "repo": "kernel-builder",
-         "rev": "524b628fd8e58525dbd28455bffb0628092c5265",
-         "type": "github"
-       },
-       "original": {
-         "owner": "huggingface",
-         "ref": "torch-2.8",
-         "repo": "kernel-builder",
-         "type": "github"
-       }
-     },
-     "nixpkgs": {
-       "locked": {
-         "lastModified": 1752785354,
-         "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
-         "owner": "nixos",
-         "repo": "nixpkgs",
-         "rev": "d38025438a6ee456758dc03188ca6873a415463b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nixos",
-         "repo": "nixpkgs",
-         "rev": "d38025438a6ee456758dc03188ca6873a415463b",
-         "type": "github"
-       }
-     },
-     "root": {
-       "inputs": {
-         "kernel-builder": "kernel-builder"
-       }
-     },
-     "systems": {
-       "locked": {
-         "lastModified": 1681028828,
-         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-         "owner": "nix-systems",
-         "repo": "default",
-         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nix-systems",
-         "repo": "default",
-         "type": "github"
-       }
-     },
-     "systems_2": {
-       "locked": {
-         "lastModified": 1681028828,
-         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-         "owner": "nix-systems",
-         "repo": "default",
-         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nix-systems",
-         "repo": "default",
-         "type": "github"
-       }
-     }
-   },
-   "root": "root",
-   "version": 7
- }
 
flake.nix DELETED
@@ -1,17 +0,0 @@
- {
-   description = "Flake for deformable_detr kernels";
-
-   inputs = {
-     kernel-builder.url = "github:huggingface/kernel-builder/torch-2.8";
-   };
-
-   outputs =
-     {
-       self,
-       kernel-builder,
-     }:
-     kernel-builder.lib.genFlakeOutputs {
-       path = ./.;
-       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
-     };
- }
 
torch-ext/deformable_detr/__init__.py DELETED
@@ -1,46 +0,0 @@
- from typing import List
- import torch
-
- from ._ops import ops
- from . import layers
-
-
- def ms_deform_attn_backward(
-     value: torch.Tensor,
-     spatial_shapes: torch.Tensor,
-     level_start_index: torch.Tensor,
-     sampling_loc: torch.Tensor,
-     attn_weight: torch.Tensor,
-     grad_output: torch.Tensor,
-     im2col_step: int,
- ) -> List[torch.Tensor]:
-     return ops.ms_deform_attn_backward(
-         value,
-         spatial_shapes,
-         level_start_index,
-         sampling_loc,
-         attn_weight,
-         grad_output,
-         im2col_step,
-     )
-
-
- def ms_deform_attn_forward(
-     value: torch.Tensor,
-     spatial_shapes: torch.Tensor,
-     level_start_index: torch.Tensor,
-     sampling_loc: torch.Tensor,
-     attn_weight: torch.Tensor,
-     im2col_step: int,
- ) -> torch.Tensor:
-     return ops.ms_deform_attn_forward(
-         value,
-         spatial_shapes,
-         level_start_index,
-         sampling_loc,
-         attn_weight,
-         im2col_step,
-     )
-
-
- __all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"]
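A minimal smoke test for these wrappers might look like the following sketch. All shapes are illustrative, a CUDA device is required (the CPU stubs only raise), and the import assumes the built package is importable as `deformable_detr`:

```python
import torch
from deformable_detr import ms_deform_attn_forward  # assumes the built extension is on the path

spatial_shapes = torch.tensor([[32, 32], [16, 16]], dtype=torch.long, device="cuda")
areas = spatial_shapes[:, 0] * spatial_shapes[:, 1]
level_start_index = torch.cat((areas.new_zeros(1), areas.cumsum(0)[:-1]))

N, M, D, Lq, L, P = 2, 8, 16, 100, 2, 4  # batch, heads, head dim, queries, levels, points
value = torch.rand(N, int(areas.sum()), M, D, device="cuda")
sampling_loc = torch.rand(N, Lq, M, L, P, 2, device="cuda")
attn_weight = torch.rand(N, Lq, M, L, P, device="cuda")
attn_weight = attn_weight / attn_weight.flatten(-2).sum(-1)[..., None, None]  # normalize over L*P

out = ms_deform_attn_forward(
    value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step=64
)
assert out.shape == (N, Lq, M * D)
```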
 
torch-ext/deformable_detr/layers.py DELETED
@@ -1,84 +0,0 @@
- from typing import List, Union, Tuple
-
- from torch import Tensor
- from torch.autograd import Function
- from torch.autograd.function import once_differentiable
- import torch.nn as nn
-
- from ._ops import ops
-
-
- class MultiScaleDeformableAttentionFunction(Function):
-     @staticmethod
-     def forward(
-         context,
-         value: Tensor,
-         value_spatial_shapes: Tensor,
-         value_level_start_index: Tensor,
-         sampling_locations: Tensor,
-         attention_weights: Tensor,
-         im2col_step: int,
-     ):
-         context.im2col_step = im2col_step
-         output = ops.ms_deform_attn_forward(
-             value,
-             value_spatial_shapes,
-             value_level_start_index,
-             sampling_locations,
-             attention_weights,
-             context.im2col_step,
-         )
-         context.save_for_backward(
-             value,
-             value_spatial_shapes,
-             value_level_start_index,
-             sampling_locations,
-             attention_weights,
-         )
-         return output
-
-     @staticmethod
-     @once_differentiable
-     def backward(context, grad_output):
-         (
-             value,
-             value_spatial_shapes,
-             value_level_start_index,
-             sampling_locations,
-             attention_weights,
-         ) = context.saved_tensors
-         grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward(
-             value,
-             value_spatial_shapes,
-             value_level_start_index,
-             sampling_locations,
-             attention_weights,
-             grad_output,
-             context.im2col_step,
-         )
-
-         return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
-
-
- class MultiScaleDeformableAttention(nn.Module):
-     def forward(
-         self,
-         value: Tensor,
-         value_spatial_shapes: Tensor,
-         value_spatial_shapes_list: List[Tuple],
-         level_start_index: Tensor,
-         sampling_locations: Tensor,
-         attention_weights: Tensor,
-         im2col_step: int,
-     ):
-         return MultiScaleDeformableAttentionFunction.apply(
-             value,
-             value_spatial_shapes,
-             level_start_index,
-             sampling_locations,
-             attention_weights,
-             im2col_step,
-         )
-
-
- __all__ = ["MultiScaleDeformableAttention"]
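Gradients flow through this module via the `Function` above; note that `value_spatial_shapes_list` is accepted for API parity but not forwarded to the op. A sketch of a backward pass, again with illustrative shapes and a CUDA device assumed:

```python
import torch

spatial_shapes = torch.tensor([[16, 16], [8, 8]], dtype=torch.long, device="cuda")
areas = spatial_shapes[:, 0] * spatial_shapes[:, 1]
level_start_index = torch.cat((areas.new_zeros(1), areas.cumsum(0)[:-1]))

value = torch.rand(2, int(areas.sum()), 8, 16, device="cuda", requires_grad=True)
sampling_loc = torch.rand(2, 50, 8, 2, 4, 2, device="cuda", requires_grad=True)
attn_weight = torch.rand(2, 50, 8, 2, 4, device="cuda")
attn_weight = (attn_weight / attn_weight.flatten(-2).sum(-1)[..., None, None]).requires_grad_(True)

attn = MultiScaleDeformableAttention()
out = attn(value, spatial_shapes, [(16, 16), (8, 8)], level_start_index,
           sampling_loc, attn_weight, 64)
out.sum().backward()  # dispatches to ops.ms_deform_attn_backward via the Function above
assert value.grad is not None and sampling_loc.grad is not None
```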
 
torch-ext/ms_deform_attn_cpu.cpp DELETED
@@ -1,40 +0,0 @@
- /*!
- **************************************************************************************************
- * Deformable DETR
- * Copyright (c) 2020 SenseTime. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
- **************************************************************************************************
- * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
- **************************************************************************************************
- */
-
- #include <vector>
-
- #include <ATen/ATen.h>
- #include <ATen/cuda/CUDAContext.h>
-
-
- at::Tensor
- ms_deform_attn_cpu_forward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const int im2col_step)
- {
-     AT_ERROR("Not implemented on CPU");
- }
-
- std::vector<at::Tensor>
- ms_deform_attn_cpu_backward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const at::Tensor &grad_output,
-     const int im2col_step)
- {
-     AT_ERROR("Not implemented on CPU");
- }
 
torch-ext/ms_deform_attn_cpu.h DELETED
@@ -1,32 +0,0 @@
- /*!
- **************************************************************************************************
- * Deformable DETR
- * Copyright (c) 2020 SenseTime. All Rights Reserved.
- * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
- **************************************************************************************************
- * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
- **************************************************************************************************
- */
-
- #pragma once
- #include <torch/extension.h>
-
- at::Tensor
- ms_deform_attn_cpu_forward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const int im2col_step);
-
- std::vector<at::Tensor>
- ms_deform_attn_cpu_backward(
-     const at::Tensor &value,
-     const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index,
-     const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight,
-     const at::Tensor &grad_output,
-     const int im2col_step);
-
 
torch-ext/torch_binding.cpp DELETED
@@ -1,19 +0,0 @@
- #include <torch/library.h>
-
- #include "registration.h"
- #include "torch_binding.h"
-
- TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-   ops.def("ms_deform_attn_forward(Tensor value, Tensor spatial_shapes,"
-           " Tensor level_start_index, Tensor sampling_loc,"
-           " Tensor attn_weight, int im2col_step) -> Tensor");
-   ops.impl("ms_deform_attn_forward", torch::kCUDA, &ms_deform_attn_cuda_forward);
-
-   ops.def("ms_deform_attn_backward(Tensor value, Tensor spatial_shapes,"
-           " Tensor level_start_index, Tensor sampling_loc,"
-           " Tensor attn_weight, Tensor grad_output,"
-           " int im2col_step) -> Tensor[]");
-   ops.impl("ms_deform_attn_backward", torch::kCUDA, &ms_deform_attn_cuda_backward);
- }
-
- REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
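Once the shared library is loaded, the two schemas registered above are reachable through `torch.ops` under the namespace fixed at build time by `TORCH_EXTENSION_NAME`; the package's `_ops` module resolves that indirection for callers. A hypothetical inspection, with path and namespace purely illustrative:

```python
import torch

# Illustrative path and namespace; both are determined by the build.
torch.ops.load_library("build/_deformable_detr.so")
print(torch.ops._deformable_detr.ms_deform_attn_forward)
```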
 
torch-ext/torch_binding.h DELETED
@@ -1,16 +0,0 @@
- #pragma once
-
- #include <torch/torch.h>
-
- at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value,
-                                        const at::Tensor &spatial_shapes,
-                                        const at::Tensor &level_start_index,
-                                        const at::Tensor &sampling_loc,
-                                        const at::Tensor &attn_weight,
-                                        const int64_t im2col_step);
-
- std::vector<at::Tensor> ms_deform_attn_cuda_backward(
-     const at::Tensor &value, const at::Tensor &spatial_shapes,
-     const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
-     const at::Tensor &attn_weight, const at::Tensor &grad_output,
-     const int64_t im2col_step);