diff --git a/README.md b/README.md index f90b4cf47563fa86e0425aaf66b395433ede26ab..bf9f2ddd7ef78ebf080e4e4cefae031874c39667 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,6 @@ --- license: apache-2.0 tags: - - kernels + - kernel --- -![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/deformable-detr) - -## deformable-detr - -Kernel source: https://github.com/huggingface/kernels-community/tree/main/deformable-detr diff --git a/build.toml b/build.toml new file mode 100644 index 0000000000000000000000000000000000000000..09496f4d32b405396fab6b6aba45e3a1e51402b5 --- /dev/null +++ b/build.toml @@ -0,0 +1,19 @@ +[general] +name = "deformable_detr" + +[torch] +src = [ + "torch-ext/torch_binding.cpp", + "torch-ext/torch_binding.h" +] + +[kernel.activation] +cuda-capabilities = [ "7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0" ] +src = [ + "deformable_detr/ms_deform_attn_cuda.cu", + "deformable_detr/ms_deform_im2col_cuda.cuh", + "deformable_detr/ms_deform_attn_cuda.cuh", + "deformable_detr/ms_deform_attn_cuda.h", +] +include = ["."] +depends = [ "torch" ] diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index 7a3799509136f864ae499158b7c71a766100148a..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:289412c0ad8ccbad14da1dff0ca83b0209fe297160bc3ccfc39788d4baf3920c -size 8541064 diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . 
import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch210-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index d125d2d2609f765195e156df04281035d2d1b3b1..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c300927e55ee0a0692b44199f75e7fbe5f5295af264ac8a83c0c8f5f606d234a -size 11524544 diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch210-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. - path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layers.py b/build/torch210-cxx11-cu128-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu128-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class 
MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": 
[] -} \ No newline at end of file diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index 8ac3c9f6a167e85f4b8e5f3c95d965fbd9cd37f1..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu130-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb099a1112b6fd48be38ae4e591d81dbb2e4aa02b423435ede5fbeb9c4d0433c -size 9808352 diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py deleted file mode 100644 index 
b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch210-cxx11-cu130-x86_64-linux/deformable_detr/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu130-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layers.py b/build/torch210-cxx11-cu130-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu130-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, 
grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index f622604e89058689647ba41b3c71ccbd3aa68ae7..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae00c12295a458e2534149aea16da0289541447123c19fae59baaf6d6d2752f1 -size 6693656 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_cxy6p3o2latjs.abi3.so b/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_cxy6p3o2latjs.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..4e4da3ee7fdd606fe0498f09c1eb4d758861af5e --- /dev/null +++ b/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_cxy6p3o2latjs.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf71a0243675c22ba3207a6f895a907b0699f964575088e054220cea5e2fb2e +size 5870376 diff --git a/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py b/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..ffb73bc1bbd96dfb75a830b83b58676cd1377989 100644 --- a/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch25-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_cxy6p3o2latjs +ops = torch.ops._deformable_detr_cxy6p3o2latjs def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_cxy6p3o2latjs::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 25768c52fa2f7041778baad9b1beaaee6772e8d3..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ab8cf59779b768359df0fa268b6cd52be2f518dd4fafdd61baec31c64f44813 -size 6679440 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_deformable_detr_esifsbuexbtbw.abi3.so b/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_deformable_detr_esifsbuexbtbw.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..dd7fd6ca351b521f64a078d315551edad1f02f8b --- /dev/null +++ b/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_deformable_detr_esifsbuexbtbw.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dce2e84e09fb2a5bf1b47441b226343494807687d8829f141682af9b78e361 +size 5856160 diff --git a/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_ops.py b/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..7272205607f6b6db5e3f6aa7673bc77798647317 100644 --- a/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch25-cxx11-cu121-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_esifsbuexbtbw +ops = torch.ops._deformable_detr_esifsbuexbtbw def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_esifsbuexbtbw::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index a574c7a01d734f9f4fb7210058d3064c280c5564..0000000000000000000000000000000000000000 --- a/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5cdcd6902a03140074cff4cd44bf6b47dc27a32e13e0515a93929c66be186cab -size 6652680 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_cuzn3o54ku5iq.abi3.so b/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_cuzn3o54ku5iq.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..1792ad2bb4c1dc835a4fdf2e54a4d9b0ad354ec9 --- /dev/null +++ b/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_cuzn3o54ku5iq.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825c7cb6f9a4350bdcdffa4383d7a527d5fa7b0d9d83222f5d1e72f1c6087841 +size 5841688 diff --git a/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py b/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..3534aa4eb5695150d1dd7d8c0f439fb6452edca9 100644 --- a/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch25-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_cuzn3o54ku5iq +ops = torch.ops._deformable_detr_cuzn3o54ku5iq def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_cuzn3o54ku5iq::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 6360ed5a03fd9fbe757e8b7c1816f9935b13764f..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82174ec2812ee672a447b94fb5ec907e348eb3d0be338daddf145a1d74969a6f -size 6686592 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_gom2c5vfrl2ic.abi3.so b/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_gom2c5vfrl2ic.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..95c51f0c1f8a7e90e248bc945259220e468dfd91 --- /dev/null +++ b/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_gom2c5vfrl2ic.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbe4c67fc885df711581660f72d86dbd0a237c7f106308e55a484725c88e9927 +size 5863312 diff --git a/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py b/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..734f23a0e93ba1c1e7e8bf8501df925bebdee046 100644 --- a/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch25-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_gom2c5vfrl2ic +ops = torch.ops._deformable_detr_gom2c5vfrl2ic def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_gom2c5vfrl2ic::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index d23c2f012773987542cbcab34a84e3eb240bdff9..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9d8540a4ffa00d331f60204fe6baf543a45667d6bba2c0a0b23aca9202b6233 -size 6672464 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_deformable_detr_a7sajsuqrick6.abi3.so b/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_deformable_detr_a7sajsuqrick6.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..0bd1ef4078bb421da435e98a7bb2f9311eda0739 --- /dev/null +++ b/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_deformable_detr_a7sajsuqrick6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56b4c64eb7931a6f580bd5b806eae1aea43b3bb8c0f115d5d202f151974a5e7b +size 5853280 diff --git a/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_ops.py b/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..4531116c7e3fbc3258dcd23ebdbe22768a8d566a 100644 --- a/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch25-cxx98-cu121-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_a7sajsuqrick6 +ops = torch.ops._deformable_detr_a7sajsuqrick6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_a7sajsuqrick6::{op_name}" \ No newline at end of file diff --git a/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index aaa5548734ba34766edf646277e25b48752d6b7f..0000000000000000000000000000000000000000 --- a/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa748a4de72c06de09f46b4af4fec7f23cb2c76eb8683c117fefd20833cd3fd8 -size 6649800 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_tyogxwmtolvok.abi3.so b/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_tyogxwmtolvok.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..b53558717a3608677dd75c82cdc027ab1374179d --- /dev/null +++ b/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_tyogxwmtolvok.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eef07a96ddf574e5b1e07476089a62659a70faa33c82fc79987c54fecb2711f +size 5834712 diff --git a/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py b/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..b3c0318eaacce12db912a3c5e91ad578adbe74ca 100644 --- a/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch25-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_tyogxwmtolvok +ops = torch.ops._deformable_detr_tyogxwmtolvok def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_tyogxwmtolvok::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_5kxpyt5yogkv2.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_5kxpyt5yogkv2.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..11b825669658ee3d1a209111ce5c954f47627b64 --- /dev/null +++ b/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_5kxpyt5yogkv2.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c5bb5376002363e2008eb6db64ebe0c9f6c31f9a635b7420ddfb46dce16b02 +size 5870352 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 
64ea4bcc2dfb73951fa3315247c594ea5b6aee1c..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:57e38bac3087c1446307e504b1e22e61ae584d1de7f5b3d15bd7a60780c3431c -size 6693632 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..68d3b35da5c2af46145714ea08cdb287d2a50d5d 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_5kxpyt5yogkv2 +ops = torch.ops._deformable_detr_5kxpyt5yogkv2 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_5kxpyt5yogkv2::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 658ac25c98e67be8affb211b20fc2af46efcbca2..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d7c09d3bedd89d7119e7023a07784724d3a3f79664b75fce37b778ef3bcfe52 -size 6648656 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_titoehueyfqjg.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_titoehueyfqjg.abi3.so new file mode 100755 index 
0000000000000000000000000000000000000000..b64a727062504abecbb7bd162d6f8c6848de3b53 --- /dev/null +++ b/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_deformable_detr_titoehueyfqjg.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76b74d4bdbb1f562474b987fd23430d12b9f033183198f35a7dfd21fcc8ce4e1 +size 5837664 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..0df7bff1cf00c94ce4da6eee12f6e49f513bc3b8 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_titoehueyfqjg +ops = torch.ops._deformable_detr_titoehueyfqjg def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_titoehueyfqjg::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py b/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index d1b1fe211439d00ca4681305f56e91b4027fe45e..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6856b6efe5130f019f6cb7f964d7a2073f1ecc5cd7afc850334e64798f871dae -size 6833224 diff --git a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py b/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/layers.py b/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 10bcc5c4bad459abe354c507166121a5a47d5d74..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c76ad874b78882d3108a7fdaf49f8c00b6a6a7dceec63912118f8fa7d07e5f30 -size 6800656 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_imqt5tuqtmyt4.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_imqt5tuqtmyt4.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..61011f23d656b543339a1eb2e1481684e3d66e10 --- /dev/null +++ b/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_imqt5tuqtmyt4.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1acd032c2f3bc530872e0839d8bec8950b01668c913539a2e14008a1e652560f +size 5944608 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..4ea7469cf391720f056311bda128fbec93749437 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . 
import _deformable_detr_imqt5tuqtmyt4 +ops = torch.ops._deformable_detr_imqt5tuqtmyt4 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_imqt5tuqtmyt4::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index a15ec9d8735e3de07ebd3bfaae1b2ded8de87447..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1fb3a24fd95c1cc3cba080ae1c9d4217f377435770c7e423de53b11ecc437dc -size 6686600 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_qbnaho3zp2d3o.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_qbnaho3zp2d3o.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..353504e4fa52b281748f32dfcee02781e450794a --- /dev/null +++ b/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_deformable_detr_qbnaho3zp2d3o.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9e5074a5afdb137688e20182cf4c9f7cbb1e8a69651c08a570076aeedc8c76b +size 5863320 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..ed42e13a067a186a1f701cec376afa3194956f6b 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . 
import _deformable_detr_qbnaho3zp2d3o +ops = torch.ops._deformable_detr_qbnaho3zp2d3o def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_qbnaho3zp2d3o::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_5oxft6tr6jbvu.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_5oxft6tr6jbvu.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e799c57967e1132680a7e360d5cb6b197932b787 --- /dev/null +++ b/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_5oxft6tr6jbvu.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd4d0f47c165b9ce95c0328cb7a52e331e4c698746ea8e4d43c7d09c193e34bd +size 5834720 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 63c34c123c4b78349bc1a2bb92b0614c4b689a13..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5a455975be5790964cc95c6813d293b1aba581f5c2dc132c9a08690bf6e5cad -size 6649808 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..bb7524858e03d0f913bd747f4c79db60d2737aa2 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . 
import _deformable_detr_5oxft6tr6jbvu +ops = torch.ops._deformable_detr_5oxft6tr6jbvu def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_5oxft6tr6jbvu::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/__init__.py b/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 6740aef68abf0d71d0756d6b2f78fc2da67c6752..0000000000000000000000000000000000000000 --- 
a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c13dce2b080676eb192d87ba83df6ef1f6d0f1101727f4b29185d48dec7281d -size 6829872 diff --git a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/_ops.py b/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/layers.py b/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - 
value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index f3ce58cab856be1f382aa9f3cf1bb0b1ec06d41d..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6ad344319579f0abef7fe1a9d3f479f1c8737994f563a540815a1445020959e -size 6797712 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_deformable_detr_po264mz2i2ffg.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_deformable_detr_po264mz2i2ffg.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..952c1115177dbfead31b4cfe79df0fe6175b0fe1 --- /dev/null +++ 
b/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_deformable_detr_po264mz2i2ffg.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:129844ba533ee201cd3f2bb0e17a354ee8aa35176c10896454926485acdacdac +size 5945760 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_ops.py index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..c411feebca4fae25684ed9b07abfdc21f1c55488 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_ops.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/deformable_detr/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe +from . import _deformable_detr_po264mz2i2ffg +ops = torch.ops._deformable_detr_po264mz2i2ffg def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file + return f"_deformable_detr_po264mz2i2ffg::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index a18bf078a3920470e28c50b9bc3b7efffad37d72..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 8a6f280954dd80fbb6d49390c446f50f2924d3fd..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
097f24d7bc07674acd326d3a99991123d90a5838..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so deleted file mode 100644 index f7dc97abb0f55df13905e5275d76fa859beb7a30..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9293f97cc6b06bc3ba5e57cfd084abb252c287f4518935208e67e126e7cbd19b -size 6800224 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py deleted file mode 100644 index 39c2aa7875432779e86612a0e56271fe32133953..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_57c3d32 -ops = torch.ops._deformable_detr_57c3d32 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_57c3d32::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/layers.py b/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py b/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so b/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so deleted file mode 100755 index 63b666838685ae29d11438e79273b09b38f9df39..0000000000000000000000000000000000000000 --- 
a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_7c33cbe.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af2831b68229a910e8703cae2c9e720ded825e401745d38923548c444e56c37b -size 6833456 diff --git a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py b/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 676ee6fea64b714dedb7ccd1d54148dcf75575a6..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_7c33cbe -ops = torch.ops._deformable_detr_7c33cbe - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_7c33cbe::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/layers.py b/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - 
value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 03f23b4fd38706ba1888acf199b905f851d2b024..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index cca62822318d51b9dbb51466f3e0a6d5fd388306..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
7dfb8dda54ffc9062a129331374926b1fe104ecd..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so deleted file mode 100644 index abe151e58a46c02191f11213027731ed26b7a182..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df54f46f59b5b78b15314cb0825d8f1f34c7a3198e9d62ca2a65a8ca72ea79a4 -size 6911280 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py deleted file mode 100644 index 39c2aa7875432779e86612a0e56271fe32133953..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_57c3d32 -ops = torch.ops._deformable_detr_57c3d32 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_57c3d32::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 2c7fa735b9a5720691a5ed6c31e566b5bc81dd77..0000000000000000000000000000000000000000 Binary files 
a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 499ecc259157bcef7916fea79bdd097407854ba3..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 5483ccd7f0aaaeffd23797fbec5d47f3e8f49080..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so deleted file mode 100644 index 0c564ee29cec2888bffa2ce31732d90efb502625..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8926ca42814a03cdbac750f3a0cd3e3cbc28614a58e1ca5a77e82b3ad0148043 -size 9979264 diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 155c52bfca54e55425b639314b289e668c5a6ec2..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . 
import _deformable_detr_320b408 -ops = torch.ops._deformable_detr_320b408 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/layers.py b/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class 
MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file 
mode 100644 index 381ef42a3282d0c2787e94f3c29d999e4736d7c8..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 9291f79801323c5a3efc995f9c90ffa6cc35a50c..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 94595475894bf15ef880e78d37ec23bca83bed08..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so deleted file mode 100644 index 2f388366cf92d9a815c26db28ca4a855f8e713ba..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/_deformable_detr_57c3d32.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f389536870cd4acf36ab8f12d3b0bf9f847ec06e2cfc25905420796884b614e -size 9907368 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/_ops.py deleted file mode 100644 index 39c2aa7875432779e86612a0e56271fe32133953..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/_ops.py +++ /dev/null @@ 
-1,9 +0,0 @@ -import torch -from . import _deformable_detr_57c3d32 -ops = torch.ops._deformable_detr_57c3d32 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_57c3d32::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class 
MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted 
file mode 100644 index 664a914e405aa872821d228b552f22a99cb39d97..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 273cdfd52347ebcf0e0c050945d37c5dc5094a9a..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 082d05eb36d7ef831ca076e3e7d2eb070130c2ec..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_a92c8ea_dirty.abi3.so b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_a92c8ea_dirty.abi3.so deleted file mode 100755 index 3d206228684c315eb04160dc62c8a7cb6811156a..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_a92c8ea_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4a5665b05309312200ca97a80cc61340c0f5de123ab33254e5307a5ec4ed2a0 -size 6901024 diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 019cf7ce30dc11d9b791075404417b1ac47500e7..0000000000000000000000000000000000000000 --- 
a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_a92c8ea_dirty -ops = torch.ops._deformable_detr_a92c8ea_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_a92c8ea_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/layers.py b/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - 
context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so 
b/build/torch28-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index 818f37b941ec19250adbb6feb357a0ec81c67b2a..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8e98e8cdd688603d90d9bbc9fc7fd093d7c8f098ae239b33d59db563ca20d3f -size 8535712 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-cu126-x86_64-linux/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, 
grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 28a5933156ae99fa286e692f8e98dd04e50f5174..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 0009fad4a606c6aacd228a2d7af5e6263d47d861..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
18befc2ceb219257a717fe3c9800618101a3904c..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_a92c8ea_dirty.abi3.so b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_a92c8ea_dirty.abi3.so deleted file mode 100755 index 7a0b563d8285a538c60f993ba90ffee3d5b49ec6..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_a92c8ea_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f4a0a7850968822e26e3a59c801fd711231d5294193155efbf9583761e114ef -size 9849688 diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 019cf7ce30dc11d9b791075404417b1ac47500e7..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_a92c8ea_dirty -ops = torch.ops._deformable_detr_a92c8ea_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_a92c8ea_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/layers.py b/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: 
Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index 2118be1b65bcb2c028582b19fa63835513865bf1..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:615c916fe00481be53757d381f62c663c3519bf5d0dda09514b13bf9e493b807 -size 11523184 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-cu128-x86_64-linux/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, 
grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__init__.py b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 3455ebe146d88ed909d4d3a22cbafdb7062e1a5c..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 6d0da2a8b90739e30e9e2787462994764ddd0e6b..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
f35af2756ff4a30e7bd8d95092215bc26766097d..0000000000000000000000000000000000000000 Binary files a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so deleted file mode 100644 index fe1c2e4e6b379f20da5c38cee026aac074b71fb4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7f99924b14f0522d25c5f0a307ab1364a3f76d9ecf29684f80aa388f6bd443b -size 10047704 diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/_ops.py b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 155c52bfca54e55425b639314b289e668c5a6ec2..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_320b408 -ops = torch.ops._deformable_detr_320b408 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/layers.py b/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index be60b895fec86b2ac0f8f4387d10002f63cf4c8f..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:4b38ea9a577233aee07a6417d5651076d27940a1a5b9b7edb89d1745a885b071 -size 11581544 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/deformable_detr/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-cu129-x86_64-linux/layers.py b/build/torch28-cxx11-cu129-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, 
grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index a1983217c07b8aa9308afcec19e158861a1e8d25..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index d463597489341a5f4ff3f5fb13144337e67fecf2..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
93cc9e1bf37fad1569d77d219152a2a9d21967b9..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so deleted file mode 100644 index 318dcfa2068b3b2f569cb7043345b696c9759c7b..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e471a8e5692e1fb09dc5750fd1c76031fdb4e166082a70c9c248aa7b3a2388ca -size 6966520 diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 155c52bfca54e55425b639314b289e668c5a6ec2..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_320b408 -ops = torch.ops._deformable_detr_320b408 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/layers.py b/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index 5a9f7e9164902ca219ec4801371409e661802ce8..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:a5753b17562ea387d03d7eb7a2a5c1a83d5ab86de09b9f9ae193a4c0c03e5c98 -size 8535872 diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu126-x86_64-linux/layers.py b/build/torch29-cxx11-cu126-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, 
grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index ea791135954db5f79bf25035aa2bbb98d9011463..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index 9675af8cf30cd0915c1edaf0242f47fc4d5e9ea8..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
002a210cf6e22da73a099ad54f50b5d7a825c795..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so deleted file mode 100644 index d6b932fb9f6069676958572efb938f0ad7d94687..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b54665ded7f4ed9bdedb765f3010a52ede13f0b9107aae15ae268f08dd171d21 -size 9980808 diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 155c52bfca54e55425b639314b289e668c5a6ec2..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_320b408 -ops = torch.ops._deformable_detr_320b408 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/layers.py b/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index 9ae667b52286127cbb322e067bd3b60a90df15df..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:d06340b35f6409e698a981a3b8503a0348ce2878111b0928719294de88042b72 -size 11519248 diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layers.py b/build/torch29-cxx11-cu128-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, 
grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . 
import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index bcbec634269105ed3f050790c341233863147939..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc deleted file mode 100644 index d9b31300b9ba63446c10d5b68d0f83b08e7b0f86..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/_ops.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc deleted file mode 100644 index 
ff6ce2776f285f4d38c98b0601243fd2230f543f..0000000000000000000000000000000000000000 Binary files a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/__pycache__/layers.cpython-313.pyc and /dev/null differ diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so deleted file mode 100644 index fd13488427cdf7c7720d74a8e581c8ee296b5b60..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/_deformable_detr_320b408.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6cc1595c80314a757e2d418bf2cf278ea2e1f8fadee0cd5440a753924ef87f0a -size 9103096 diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/_ops.py deleted file mode 100644 index 155c52bfca54e55425b639314b289e668c5a6ec2..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_320b408 -ops = torch.ops._deformable_detr_320b408 - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_deformable_detr_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/layers.py b/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-aarch64-linux/deformable_detr/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - 
sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py deleted file mode 100644 index 33db73ca6e361af4707ba5bb5f55bf0e7c3005a4..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List -import torch - -from ._ops import ops -from . import layers - - -def ms_deform_attn_backward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - grad_output: torch.Tensor, - im2col_step: int, -) -> List[torch.Tensor]: - return ops.ms_deform_attn_backward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - grad_output, - im2col_step, - ) - - -def ms_deform_attn_forward( - value: torch.Tensor, - spatial_shapes: torch.Tensor, - level_start_index: torch.Tensor, - sampling_loc: torch.Tensor, - attn_weight: torch.Tensor, - im2col_step: int, -) -> torch.Tensor: - return ops.ms_deform_attn_forward( - value, - spatial_shapes, - level_start_index, - sampling_loc, - attn_weight, - im2col_step, - ) - - -__all__ = ["layers", "ms_deform_attn_forward", "ms_deform_attn_backward"] diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_deformable_detr_d7966ee.abi3.so b/build/torch29-cxx11-cu130-x86_64-linux/_deformable_detr_d7966ee.abi3.so deleted file mode 100644 index be5b26d320893b17db668f8c30338e63f25f7ff5..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-x86_64-linux/_deformable_detr_d7966ee.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:b54f4aea5fa0f031d09c8c035a4592d2a44a349416afc34f2d66e160a354967c -size 9803056 diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py deleted file mode 100644 index b498ff9a9fd0bc22a42440001932cf97a8a9e955..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-x86_64-linux/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _deformable_detr_d7966ee -ops = torch.ops._deformable_detr_d7966ee - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_deformable_detr_d7966ee::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu130-x86_64-linux/deformable_detr/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/deformable_detr/__init__.py deleted file mode 100644 index 03dbc1afe1cf156661a2b1b22003cd5f599a0309..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-x86_64-linux/deformable_detr/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. 
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu130-x86_64-linux/layers.py b/build/torch29-cxx11-cu130-x86_64-linux/layers.py deleted file mode 100644 index db94032dea3d445f27017f923ae80468e18d2d77..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-x86_64-linux/layers.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import List, Union, Tuple - -from torch import Tensor -from torch.autograd import Function -from torch.autograd.function import once_differentiable -import torch.nn as nn - -from ._ops import ops - - -class MultiScaleDeformableAttentionFunction(Function): - @staticmethod - def forward( - context, - value: Tensor, - value_spatial_shapes: Tensor, - value_level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - context.im2col_step = im2col_step - output = ops.ms_deform_attn_forward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - context.im2col_step, - ) - context.save_for_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) - return output - - @staticmethod - @once_differentiable - def backward(context, grad_output): - ( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - ) = context.saved_tensors - grad_value, grad_sampling_loc, 
grad_attn_weight = ops.ms_deform_attn_backward( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, - grad_output, - context.im2col_step, - ) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -class MultiScaleDeformableAttention(nn.Module): - def forward( - self, - value: Tensor, - value_spatial_shapes: Tensor, - value_spatial_shapes_list: List[Tuple], - level_start_index: Tensor, - sampling_locations: Tensor, - attention_weights: Tensor, - im2col_step: int, - ): - return MultiScaleDeformableAttentionFunction.apply( - value, - value_spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, - im2col_step, - ) - - -__all__ = ["MultiScaleDeformableAttention"] diff --git a/build/torch29-cxx11-cu130-x86_64-linux/metadata.json b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json deleted file mode 100644 index 9cf5deed9898dce769f4cc73913d3530b92a0bd8..0000000000000000000000000000000000000000 --- a/build/torch29-cxx11-cu130-x86_64-linux/metadata.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/deformable_detr/ms_deform_attn_cuda.cu b/deformable_detr/ms_deform_attn_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b9d6cc74146f22fa8cc0826ab1299a6c90c73014 --- /dev/null +++ b/deformable_detr/ms_deform_attn_cuda.cu @@ -0,0 +1,158 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include +#include "deformable_detr/ms_deform_im2col_cuda.cuh" + +#include +#include +#include +#include + +#include + + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int64_t im2col_step) +{ + at::DeviceGuard guard(value.device()); + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, static_cast(im2col_step)); + + 
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), + value.data_ptr() + n * im2col_step_ * per_value_size, + spatial_shapes.data_ptr(), + level_start_index.data_ptr(), + sampling_loc.data_ptr() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data_ptr() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + columns.data_ptr()); + + })); + } + + output = output.view({batch, num_query, num_heads*channels}); + + return output; +} + + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int64_t im2col_step) +{ + at::DeviceGuard guard(value.device()); + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + 
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, static_cast(im2col_step)); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto grad_value = at::zeros_like(value); + auto grad_sampling_loc = at::zeros_like(sampling_loc); + auto grad_attn_weight = at::zeros_like(attn_weight); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), + grad_output_g.data_ptr(), + value.data_ptr() + n * im2col_step_ * 
per_value_size, + spatial_shapes.data_ptr(), + level_start_index.data_ptr(), + sampling_loc.data_ptr() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data_ptr() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + grad_value.data_ptr() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data_ptr() + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data_ptr() + n * im2col_step_ * per_attn_weight_size); + + })); + } + + return { + grad_value, grad_sampling_loc, grad_attn_weight + }; +} diff --git a/deformable_detr/ms_deform_attn_cuda.cuh b/deformable_detr/ms_deform_attn_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..20ae6892e4b9881578a72aae27ddc4ec9f68ae1c --- /dev/null +++ b/deformable_detr/ms_deform_attn_cuda.cuh @@ -0,0 +1,1467 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int 
// Backward pass of multi-scale deformable attention.
//
// Takes the same inputs as the forward pass plus grad_output, the gradient of
// the loss w.r.t. the forward result, and returns
// { grad_value, grad_sampling_loc, grad_attn_weight } (same shapes as the
// corresponding inputs).  The return type was stripped to "std::vector" by
// extraction; restored to std::vector<at::Tensor>.
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{

    // Raw-stride indexing in the kernels requires contiguous CUDA inputs.
    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");

    AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
    AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
    AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
    AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
    AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
    AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    const int im2col_step_ = std::min(batch, im2col_step);

    // NOTE(review): message wording is inverted (im2col_step must divide batch);
    // kept byte-identical to upstream.
    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);

    auto grad_value = at::zeros_like(value);
    auto grad_sampling_loc = at::zeros_like(sampling_loc);
    auto grad_attn_weight = at::zeros_like(attn_weight);

    // Per-batch-element strides used to offset raw pointers for each chunk.
    const int batch_n = im2col_step_;
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});

    for (int n = 0; n < batch/im2col_step_; ++n)
    {
        auto grad_output_g = grad_output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
                grad_output_g.data_ptr<scalar_t>(),
                value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
                spatial_shapes.data_ptr<int64_t>(),
                level_start_index.data_ptr<int64_t>(),
                sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                grad_value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
                grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);

        }));
    }

    return {
        grad_value, grad_sampling_loc, grad_attn_weight
    };
}
// Default launch width; GET_BLOCKS rounds N up to a whole number of blocks.
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads)
{
  return (N + num_threads - 1) / num_threads;
}


// Bilinearly sample one (head m, channel c) value at fractional location (h, w)
// from a (height, width, nheads, channels) row-major plane.  Corners that fall
// outside the plane contribute 0 (border is treated as zero padding).
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
                                                   const int &height, const int &width, const int &nheads, const int &channels,
                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional offsets and their complements = bilinear weights components.
  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  // Fetch the four neighbours, skipping any that lie outside the plane.
  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
  }

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}
// Backward of the bilinear sample above for one point: accumulates the value
// gradient into grad_value (atomically, since several queries may sample the
// same cell) and writes the sampling-location / attention-weight gradients
// DIRECTLY (non-atomic) -- callers must guarantee exclusive ownership of
// grad_sampling_loc / grad_attn_weight (here: per-thread shared-memory slots).
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
                                               const int &height, const int &width, const int &nheads, const int &channels,
                                               const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                               const scalar_t &top_grad,
                                               const scalar_t &attn_weight,
                                               scalar_t* &grad_value,
                                               scalar_t* grad_sampling_loc,
                                               scalar_t* grad_attn_weight)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  // For each in-bounds corner: read its value (for the location gradient) and
  // atomically push the weighted output gradient into grad_value.
  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value+ptr1, w1*top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value+ptr2, w2*top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value+ptr3, w3*top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value+ptr4, w4*top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  *grad_attn_weight = top_grad * val;
  // Chain rule through the normalized->pixel mapping (x*W - 0.5): scale by W/H.
  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}
// Same backward computation as ms_deform_attn_col2im_bilinear, but the
// "_gm" (global-memory) variant writes ALL three gradients with atomicAdd,
// so grad_sampling_loc / grad_attn_weight may be shared global buffers.
// Used when channels is too large for the shared-memory reduction kernels.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
                                                  const int &height, const int &width, const int &nheads, const int &channels,
                                                  const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                                  const scalar_t &top_grad,
                                                  const scalar_t &attn_weight,
                                                  scalar_t* &grad_value,
                                                  scalar_t* grad_sampling_loc,
                                                  scalar_t* grad_attn_weight)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value+ptr1, w1*top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value+ptr2, w2*top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value+ptr3, w3*top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value+ptr4, w4*top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  atomicAdd(grad_attn_weight, top_grad * val);
  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
// Forward kernel: each logical index covers one (batch, query, head, channel)
// output element; it sums bilinear samples over all levels and points,
// weighted by the attention weights, and writes the result to data_col.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Decode the flat index into (b_col, q_col, m_col, c_col); sampling_index
    // identifies the (b, q, head) triple shared by all channels.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    scalar_t *data_col_ptr = data_col + index;
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;  // loc buffer stores (w, h) pairs
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
    scalar_t col = 0;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Map normalized [0,1] location to pixel space (align-corners=False style).
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
        }

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
      }
    }
    *data_col_ptr = col;
  }
}
// Backward kernel, compile-time blockSize variant, serial reduction:
// every thread computes per-channel gradients into static shared memory and
// thread 0 sums them linearly before storing.  Dispatched for small channel
// counts where blockDim.x == channels (see the switch in the launcher) --
// TODO confirm against launcher.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        // Zero this thread's slots so out-of-bounds samples contribute nothing.
        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight+threadIdx.x)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
        }

        __syncthreads();
        if (tid == 0)
        {
          // Serial sum across the block's per-thread slots.
          // (Originally the loop counter shadowed `tid`; renamed to `k`.)
          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
          int sid=2;
          for (unsigned int k = 1; k < blockSize; ++k)
          {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[k];
            sid += 2;
          }

          *grad_sampling_loc = _grad_w;
          *(grad_sampling_loc + 1) = _grad_h;
          *grad_attn_weight = _grad_a;
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, compile-time blockSize variant, tree reduction:
// identical accumulation to the v1 kernel but the per-thread shared-memory
// slots are summed with a log2(blockSize) halving reduction.  Requires
// blockSize to be a power of two (the halving loop assumes it).
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight+threadIdx.x)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
        }

        __syncthreads();

        // Standard power-of-two tree reduction over the shared caches.
        for (unsigned int s=blockSize/2; s>0; s>>=1)
        {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          *grad_sampling_loc = cache_grad_sampling_loc[0];
          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight = cache_grad_attn_weight[0];
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, runtime block size, serial reduction: same as the
// blocksize-aware v1 kernel but the caches live in dynamically allocated
// shared memory sized by the launcher (3 * blockDim.x * sizeof(scalar_t)).
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Dynamic shared memory: first 2*blockDim.x scalars hold (w, h) location
    // gradients, the following blockDim.x hold attention-weight gradients.
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight+threadIdx.x)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
        }

        __syncthreads();
        if (tid == 0)
        {
          // Serial sum over all blockDim.x slots.
          // (Originally the loop counter shadowed `tid`; renamed to `k`.)
          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
          int sid=2;
          for (unsigned int k = 1; k < blockDim.x; ++k)
          {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[k];
            sid += 2;
          }

          *grad_sampling_loc = _grad_w;
          *(grad_sampling_loc + 1) = _grad_h;
          *grad_attn_weight = _grad_a;
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, runtime block size, tree reduction: like shm_reduce_v1 but
// sums the dynamic-shared caches with a halving reduction.  The extra
// `spre`/`s << 1` step folds in the odd element when the active width is not
// an exact power of two.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight+threadIdx.x)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
        }

        __syncthreads();

        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
        {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
            if (tid + (s << 1) < spre)
            {
              // Fold in the dangling element of an odd-width stage.
              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          *grad_sampling_loc = cache_grad_sampling_loc[0];
          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight = cache_grad_attn_weight[0];
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, runtime block size, tree reduction, MULTI-BLOCK variant:
// identical to shm_reduce_v2 except the final per-point gradients are written
// with atomicAdd, because several blocks may own pieces of the same
// (query, head) gradient slot when channels exceeds one block.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight+threadIdx.x)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
        }

        __syncthreads();

        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
        {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
            if (tid + (s << 1) < spre)
            {
              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          // atomicAdd (not plain store): other blocks may also accumulate here.
          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, global-memory fallback: no shared-memory reduction at all;
// every gradient write goes through atomicAdd (via the _gm bilinear helper).
// Used when channels is large and not a multiple the shm kernels can handle.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear_gm(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            grad_sampling_loc, grad_attn_weight);
        }
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}


// Host-side launcher for the forward kernel: one thread per output element,
// launch configuration restored (it was stripped to "<<>>" by extraction).
template <typename scalar_t>
void ms_deformable_im2col_cuda(cudaStream_t stream,
                              const scalar_t* data_value,
                              const int64_t* data_spatial_shapes,
                              const int64_t* data_level_start_index,
                              const scalar_t* data_sampling_loc,
                              const scalar_t* data_attn_weight,
                              const int batch_size,
                              const int spatial_size,
                              const int num_heads,
                              const int channels,
                              const int num_levels,
                              const int num_query,
                              const int num_point,
                              scalar_t* data_col)
{
  const int num_kernels = batch_size * num_query * num_heads * channels;
  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
  const int num_threads = CUDA_NUM_THREADS;
  ms_deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
         0, stream>>>(
      num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
      batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
  }

}

// Host-side launcher for the backward kernels; the body continues on the
// following (unreviewed) lines -- only the signature prefix visible here is
// reconstructed so the remainder of the original definition follows it.
template <typename scalar_t>
void ms_deformable_col2im_cuda(cudaStream_t stream,
                              const scalar_t* grad_col,
                              const scalar_t* data_value,
                              const int64_t * data_spatial_shapes,
                              const int64_t * data_level_start_index,
                              const scalar_t * data_sampling_loc,
                              const scalar_t * data_attn_weight,
                              const int batch_size,
                              const int spatial_size,
                              const int num_heads,
                              const int channels,
                              const int num_levels,
// Host-side launcher for the multi-scale deformable attention backward
// (col2im) pass.  Picks a reduction kernel variant based on `channels`
// (one block reduces the per-thread sampling-location / attention-weight
// gradients for one query sample):
//   - power-of-two channels <= 1024: statically sized shared memory
//     (blockSize is a template parameter);
//   - other channels < 1024: dynamically sized shared memory;
//   - channels > 1024: multi-block kernel (multiple of 1024) or the
//     purely atomic global-memory fallback.
//
// NOTE(review): template parameter lists and <<< >>> launch
// configurations were stripped from this paste; restored from the
// canonical Deformable DETR kernel source — verify against upstream.
template <typename scalar_t>
void ms_deformable_col2im_cuda(cudaStream_t stream,
                               const scalar_t* grad_col,
                               const scalar_t* data_value,
                               const int64_t* data_spatial_shapes,
                               const int64_t* data_level_start_index,
                               const scalar_t* data_sampling_loc,
                               const scalar_t* data_attn_weight,
                               const int batch_size,
                               const int spatial_size,
                               const int num_heads,
                               const int channels,
                               const int num_levels,
                               const int num_query,
                               const int num_point,
                               scalar_t* grad_value,
                               scalar_t* grad_sampling_loc,
                               scalar_t* grad_attn_weight)
{
  const int num_threads = (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
  const int num_kernels = batch_size * num_query * num_heads * channels;
  const int num_actual_kernels = batch_size * num_query * num_heads * channels;

// Every kernel variant takes the same 17 arguments; name the list once so
// the dispatch below stays readable.
#define MS_DEFORM_COL2IM_ARGS                                                 \
  num_kernels, grad_col, data_value, data_spatial_shapes,                     \
      data_level_start_index, data_sampling_loc, data_attn_weight,            \
      batch_size, spatial_size, num_heads, channels, num_levels, num_query,   \
      num_point, grad_value, grad_sampling_loc, grad_attn_weight

// One switch case per supported compile-time block size.
#define MS_DEFORM_COL2IM_CASE(BS, KERNEL)                                     \
  case BS:                                                                    \
    KERNEL<scalar_t, BS>                                                      \
        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,          \
           0, stream>>>(MS_DEFORM_COL2IM_ARGS);                               \
    break;

  if (channels > 1024)
  {
    if ((channels & 1023) == 0)
    {
      // channels is a multiple of 1024: several blocks cooperate on one
      // sample; dynamic shared memory holds 3 scalars per thread
      // (grad_w, grad_h, grad_attn).
      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
             num_threads * 3 * sizeof(scalar_t), stream>>>(MS_DEFORM_COL2IM_ARGS);
    }
    else
    {
      // Irregular large channel count: atomic global-memory fallback.
      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
             0, stream>>>(MS_DEFORM_COL2IM_ARGS);
    }
  }
  else
  {
    switch (channels)
    {
      MS_DEFORM_COL2IM_CASE(1,    ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1)
      MS_DEFORM_COL2IM_CASE(2,    ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1)
      MS_DEFORM_COL2IM_CASE(4,    ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1)
      MS_DEFORM_COL2IM_CASE(8,    ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1)
      MS_DEFORM_COL2IM_CASE(16,   ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1)
      MS_DEFORM_COL2IM_CASE(32,   ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1)
      MS_DEFORM_COL2IM_CASE(64,   ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2)
      MS_DEFORM_COL2IM_CASE(128,  ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2)
      MS_DEFORM_COL2IM_CASE(256,  ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2)
      MS_DEFORM_COL2IM_CASE(512,  ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2)
      MS_DEFORM_COL2IM_CASE(1024, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2)
      default:
        if (channels < 64)
        {
          // Serial single-thread reduction over dynamic shared memory.
          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                 num_threads * 3 * sizeof(scalar_t), stream>>>(MS_DEFORM_COL2IM_ARGS);
        }
        else
        {
          // Tree reduction over dynamic shared memory.
          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                 num_threads * 3 * sizeof(scalar_t), stream>>>(MS_DEFORM_COL2IM_ARGS);
        }
    }
  }

#undef MS_DEFORM_COL2IM_CASE
#undef MS_DEFORM_COL2IM_ARGS

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }
}
grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} diff --git a/deformable_detr/ms_deform_attn_cuda.h b/deformable_detr/ms_deform_attn_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..5bf596397916b099eb317114dc929720e0bed695 --- /dev/null +++ b/deformable_detr/ms_deform_attn_cuda.h @@ -0,0 +1,46 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +at::Tensor ms_deform_attn_cuda_forward_bf16( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + +std::vector ms_deform_attn_cuda_backward_bf16( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const 
/*!
**************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/

// NOTE(review): the #include targets were stripped from this paste (all
// angle-bracket content is missing throughout the file).  The set below
// is restored from the canonical Deformable DETR kernel source — verify
// against upstream; in particular, older forks use <THC/THCAtomics.cuh>
// instead of <ATen/cuda/Atomic.cuh> for the atomicAdd overloads.
#include <cstdio>
#include <algorithm>
#include <cstring>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/cuda/Atomic.cuh>

// Strided loop over the flat work-item range [0, n): each thread handles
// index i, i + blockDim*gridDim, ... so a fixed-size grid covers any n.
#define CUDA_KERNEL_LOOP(i, n)                          \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
       i < (n);                                         \
       i += blockDim.x * gridDim.x)

// Default number of threads per block used by the launchers in this file.
const int CUDA_NUM_THREADS = 1024;

// Number of blocks needed to cover N work items at num_threads per block
// (ceiling division).
inline int GET_BLOCKS(const int N, const int num_threads)
{
  return (N + num_threads - 1) / num_threads;
}
// Bilinear sampling of one channel of the value tensor at the fractional
// location (h, w).  `bottom_data` is laid out as (spatial, nheads,
// channels) for one batch element; `m` selects the head, `c` the channel.
// Corners that fall outside the feature map contribute zero (zero
// padding at the border).
//
// NOTE(review): `template <typename scalar_t>` restored — the angle
// bracket content was stripped from this paste.
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(
    const scalar_t* &bottom_data,
    const int &height, const int &width, const int &nheads, const int &channels,
    const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
  // Integer corners around the sampling point and the fractional parts.
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  // Flat offsets for the four corners in (spatial, nheads, channels) layout.
  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
  }

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}


// Backward counterpart of the bilinear sampler.  Accumulates the value
// gradient into global memory with atomicAdd, and writes the
// sampling-location / attention-weight gradients for THIS thread through
// plain stores (`grad_sampling_loc` / `grad_attn_weight` point at
// per-thread scratch, e.g. shared memory — a block-level reduction
// happens in the caller).
//
// NOTE(review): `template <typename scalar_t>` restored — the angle
// bracket content was stripped from this paste.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(
    const scalar_t* &bottom_data,
    const int &height, const int &width, const int &nheads, const int &channels,
    const scalar_t &h, const scalar_t &w, const int &m, const int &c,
    const scalar_t &top_grad,
    const scalar_t &attn_weight,
    scalar_t* &grad_value,
    scalar_t* grad_sampling_loc,
    scalar_t* grad_attn_weight)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  // For each in-bounds corner: read the value (for the weight/location
  // gradients) and atomically scatter the value gradient.
  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  *grad_attn_weight = top_grad * val;
  // Chain rule through the normalized->pixel mapping: scale by width/height.
  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}


// Same computation as ms_deform_attn_col2im_bilinear, but ALL gradient
// outputs — including sampling-location and attention-weight — go through
// atomicAdd, so `grad_sampling_loc` / `grad_attn_weight` may point
// straight at global memory shared between threads (used by the "gm"
// fallback kernel).
//
// NOTE(review): `template <typename scalar_t>` restored — the angle
// bracket content was stripped from this paste.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(
    const scalar_t* &bottom_data,
    const int &height, const int &width, const int &nheads, const int &channels,
    const scalar_t &h, const scalar_t &w, const int &m, const int &c,
    const scalar_t &top_grad,
    const scalar_t &attn_weight,
    scalar_t* &grad_value,
    scalar_t* grad_sampling_loc,
    scalar_t* grad_attn_weight)
{
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  const scalar_t lh = h - h_low;
  const scalar_t lw = w - w_low;
  const scalar_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const scalar_t top_grad_value = top_grad * attn_weight;
  scalar_t grad_h_weight = 0, grad_w_weight = 0;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
  {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
    grad_h_weight -= hw * v1;
    grad_w_weight -= hh * v1;
    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
  }
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
  {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
    grad_h_weight -= lw * v2;
    grad_w_weight += hh * v2;
    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
  }
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
  {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
    grad_h_weight += hw * v3;
    grad_w_weight -= lh * v3;
    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
  }
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
  {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
    grad_h_weight += lw * v4;
    grad_w_weight += lh * v4;
    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
  }

  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  atomicAdd(grad_attn_weight, top_grad * val);
  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
// Forward kernel: each thread computes one output element of data_col.
// The flat index decomposes as
//   index = ((b_col * num_query + q_col) * num_heads + m_col) * channels + c_col
// and the thread accumulates, over every level and sampling point, the
// bilinearly sampled value weighted by the attention weight.
//
// NOTE(review): `template <typename scalar_t>` restored — the angle
// bracket content was stripped from this paste.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(
    const int n,
    const scalar_t *data_value,
    const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight,
    const int batch_size,
    const int spatial_size,
    const int num_heads,
    const int channels,
    const int num_levels,
    const int num_query,
    const int num_point,
    scalar_t *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Decompose the flat output index (innermost dimension first).
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;  // (b, q, m) triple — shared by all channels
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    scalar_t *data_col_ptr = data_col + index;
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;  // locations come in (w, h) pairs
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
    scalar_t col = 0;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const scalar_t *data_value_ptr =
          data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Map the normalized location onto (fractional) pixel coordinates.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Skip points that fall fully outside the padded feature map.
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          col += ms_deform_attn_im2col_bilinear(
                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
                     h_im, w_im, m_col, c_col) * weight;
        }

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
      }
    }
    *data_col_ptr = col;
  }
}
// Backward kernel, small power-of-two channel counts (<= 32).  Shared
// memory is statically sized by the compile-time blockSize; after each
// sampling point, thread 0 serially sums the per-thread contributions
// and writes the sampling-location / attention-weight gradients.
//
// NOTE(review): the template parameter list was stripped from this
// paste; restored as `<typename scalar_t, unsigned int blockSize>` from
// the canonical Deformable DETR source.  Also renamed the inner
// reduction counter, which shadowed the outer `tid`.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int n,
    const scalar_t *grad_col,
    const scalar_t *data_value,
    const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight,
    const int batch_size,
    const int spatial_size,
    const int num_heads,
    const int channels,
    const int num_levels,
    const int num_query,
    const int num_point,
    scalar_t *grad_value,
    scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;

    // Decompose the flat index: ((b, q, m), c).
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Zero this thread's scratch before the (conditional) accumulation.
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels,
              h_im, w_im, m_col, c_col,
              top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        __syncthreads();
        if (tid == 0)
        {
          // Serial reduction over the whole block by thread 0.
          scalar_t _grad_w = cache_grad_sampling_loc[0],
                   _grad_h = cache_grad_sampling_loc[1],
                   _grad_a = cache_grad_attn_weight[0];
          int sid = 2;
          for (unsigned int t = 1; t < blockSize; ++t)
          {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[t];
            sid += 2;
          }

          *grad_sampling_loc = _grad_w;
          *(grad_sampling_loc + 1) = _grad_h;
          *grad_attn_weight = _grad_a;
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, power-of-two channel counts 64..1024.  Same structure
// as the v1 variant, but the per-point block reduction is a parallel
// tree reduction (halving stride each step) instead of a serial sum by
// thread 0.
//
// NOTE(review): the template parameter list was stripped from this
// paste; restored as `<typename scalar_t, unsigned int blockSize>` from
// the canonical Deformable DETR source.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int n,
    const scalar_t *grad_col,
    const scalar_t *data_value,
    const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight,
    const int batch_size,
    const int spatial_size,
    const int num_heads,
    const int channels,
    const int num_levels,
    const int num_query,
    const int num_point,
    scalar_t *grad_value,
    scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;

    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels,
              h_im, w_im, m_col, c_col,
              top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        __syncthreads();

        // Tree reduction: stride s halves each iteration; blockSize is a
        // power of two so every element is folded exactly once.
        for (unsigned int s = blockSize / 2; s > 0; s >>= 1)
        {
          if (tid < s)
          {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          *grad_sampling_loc = cache_grad_sampling_loc[0];
          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight = cache_grad_attn_weight[0];
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, non-power-of-two channel counts < 64.  Uses
// dynamically sized shared memory (launched with
// blockDim.x * 3 * sizeof(scalar_t) bytes): 2 scalars per thread for the
// location gradient followed by 1 per thread for the attention-weight
// gradient.  Thread 0 reduces serially over blockDim.x.
//
// NOTE(review): `template <typename scalar_t>` restored — the angle
// bracket content was stripped from this paste.  Also renamed the inner
// reduction counter, which shadowed the outer `tid`.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
    const int n,
    const scalar_t *grad_col,
    const scalar_t *data_value,
    const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight,
    const int batch_size,
    const int spatial_size,
    const int num_heads,
    const int channels,
    const int num_levels,
    const int num_query,
    const int num_point,
    scalar_t *grad_value,
    scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Dynamic shared memory, partitioned into the two caches.
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;

    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels,
              h_im, w_im, m_col, c_col,
              top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        __syncthreads();
        if (tid == 0)
        {
          scalar_t _grad_w = cache_grad_sampling_loc[0],
                   _grad_h = cache_grad_sampling_loc[1],
                   _grad_a = cache_grad_attn_weight[0];
          int sid = 2;
          for (unsigned int t = 1; t < blockDim.x; ++t)
          {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[t];
            sid += 2;
          }

          *grad_sampling_loc = _grad_w;
          *(grad_sampling_loc + 1) = _grad_h;
          *grad_attn_weight = _grad_a;
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
// Backward kernel, channel counts 64..1024 that are not powers of two.
// Dynamically sized shared memory (blockDim.x * 3 * sizeof(scalar_t)
// bytes) with a tree reduction; the extra `spre`/`(s << 1)` fold handles
// an odd element count at each reduction step so no contribution is lost
// when blockDim.x is not a power of two.
//
// NOTE(review): `template <typename scalar_t>` restored — the angle
// bracket content was stripped from this paste.
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
    const int n,
    const scalar_t *grad_col,
    const scalar_t *data_value,
    const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index,
    const scalar_t *data_sampling_loc,
    const scalar_t *data_attn_weight,
    const int batch_size,
    const int spatial_size,
    const int num_heads,
    const int channels,
    const int num_levels,
    const int num_query,
    const int num_point,
    scalar_t *grad_value,
    scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;

    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    [[maybe_unused]] const int q_col = _temp % num_query;
    _temp /= num_query;
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    grad_sampling_loc += grad_sampling_ptr << 1;
    grad_attn_weight += grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
        *(cache_grad_attn_weight + threadIdx.x) = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
              data_value_ptr, spatial_h, spatial_w, num_heads, channels,
              h_im, w_im, m_col, c_col,
              top_grad, weight, grad_value_ptr,
              cache_grad_sampling_loc + (threadIdx.x << 1),
              cache_grad_attn_weight + threadIdx.x);
        }

        __syncthreads();

        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1)
        {
          if (tid < s)
          {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
            // Fold the dangling element when the previous extent was odd.
            if (tid + (s << 1) < spre)
            {
              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          *grad_sampling_loc = cache_grad_sampling_loc[0];
          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
          *grad_attn_weight = cache_grad_attn_weight[0];
        }
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}
spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s 
<< 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + [[maybe_unused]] const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = 
data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, + const scalar_t* data_value, + const int64_t* data_spatial_shapes, + const int64_t* data_level_start_index, + const scalar_t* data_sampling_loc, + const scalar_t* data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* data_col) +{ + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in 
ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +template +void ms_deformable_col2im_cuda(cudaStream_t stream, + const scalar_t* grad_col, + const scalar_t* data_value, + const int64_t * data_spatial_shapes, + const int64_t * data_level_start_index, + const scalar_t * data_sampling_loc, + const scalar_t * data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) + { + if ((channels & 1023) == 0) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_gm + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + else{ + switch(channels) + { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + 
break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + 
data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + default: + if (channels < 64) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + 
spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000000000000000000000000000000000..6fe55afb5a309a71720974748f2796b5c26a2f05 --- /dev/null +++ b/flake.nix @@ -0,0 +1,14 @@ +{ + description = "Flake for deformable_detr kernels"; + + inputs = { + kernel-builder.url = "git+ssh://git@github.com/huggingface/kernel-builder"; + }; + + outputs = + { + self, + kernel-builder, + }: + kernel-builder.lib.genFlakeOutputs ./.; +} diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/torch-ext/deformable_detr/__init__.py similarity index 100% rename from build/torch210-cxx11-cu126-x86_64-linux/__init__.py rename to torch-ext/deformable_detr/__init__.py diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layers.py b/torch-ext/deformable_detr/layers.py similarity index 100% rename from build/torch210-cxx11-cu126-x86_64-linux/layers.py rename to torch-ext/deformable_detr/layers.py diff --git a/torch-ext/ms_deform_attn_cpu.cpp b/torch-ext/ms_deform_attn_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..388a73d22d4c9b561e2a887b50a1897b8cf2def9 --- /dev/null +++ b/torch-ext/ms_deform_attn_cpu.cpp @@ -0,0 +1,40 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. 
All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include + +#include +#include + + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} diff --git a/torch-ext/ms_deform_attn_cpu.h b/torch-ext/ms_deform_attn_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..7eac8c8bcd1bf529bb9c13d54d2d4215c9e4c89f --- /dev/null +++ b/torch-ext/ms_deform_attn_cpu.h @@ -0,0 +1,32 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + diff --git a/torch-ext/torch_binding.cpp b/torch-ext/torch_binding.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fb462761e30f07241c192fd610ac307ae3b76af2 --- /dev/null +++ b/torch-ext/torch_binding.cpp @@ -0,0 +1,19 @@ +#include + +#include "registration.h" +#include "torch_binding.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + ops.def("ms_deform_attn_forward(Tensor value, Tensor spatial_shapes," + " Tensor level_start_index, Tensor sampling_loc," + " Tensor attn_weight, int im2col_step) -> Tensor"); + ops.impl("ms_deform_attn_forward", torch::kCUDA, &ms_deform_attn_cuda_forward); + + ops.def("ms_deform_attn_backward(Tensor value, Tensor spatial_shapes," + " Tensor level_start_index, Tensor sampling_loc," + " Tensor attn_weight, Tensor grad_output," + " int im2col_step) -> Tensor[]"); + ops.impl("ms_deform_attn_backward", torch::kCUDA, &ms_deform_attn_cuda_backward); +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/torch-ext/torch_binding.h b/torch-ext/torch_binding.h new file mode 100644 index 
0000000000000000000000000000000000000000..e6e0d303971fc6641d03f610ea681369c82ff90f --- /dev/null +++ b/torch-ext/torch_binding.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int64_t im2col_step); + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, const at::Tensor &grad_output, + const int64_t im2col_step);