# ------------------------------------------------------------------------ # RF-DETR # Copyright (c) 2025 Roboflow. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR) # Copyright (c) 2024 Baidu. All Rights Reserved. # ------------------------------------------------------------------------------------------------ # Modified from Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ """ ms_deform_attn_func """ from __future__ import absolute_import from __future__ import print_function from __future__ import division import torch import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): """"for debug and test only, need to use cuda version instead """ # B, n_heads, head_dim, N B, n_heads, head_dim, _ = value.shape _, Len_q, n_heads, L, P, _ = sampling_locations.shape value_list = value.split([H * W for H, W in value_spatial_shapes], dim=3) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for lid_, (H, W) in enumerate(value_spatial_shapes): # B, n_heads, head_dim, H, W value_l_ = value_list[lid_].view(B * n_heads, head_dim, H, W) # B, Len_q, n_heads, P, 2 -> B, n_heads, Len_q, P, 2 -> B*n_heads, Len_q, P, 2 sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) # B*n_heads, head_dim, Len_q, P sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (B, Len_q, n_heads, L * P) -> (B, n_heads, Len_q, L, P) -> (B*n_heads, 1, Len_q, L*P) attention_weights = attention_weights.transpose(1, 2).reshape(B * n_heads, 1, Len_q, L * P) # B*n_heads, head_dim, Len_q, L*P sampling_value_list = torch.stack(sampling_value_list, dim=-2).flatten(-2) output = (sampling_value_list * attention_weights).sum(-1).view(B, n_heads * head_dim, Len_q) return output.transpose(1, 2).contiguous()