dianecy committed on
Commit c187b4b · verified · 1 Parent(s): 7e3a804

Upload folder using huggingface_hub
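For reference, the commit message names the huggingface_hub upload flow. A minimal sketch of how such a commit is typically produced; the repo id and local path below are placeholders, not values taken from this page:

from huggingface_hub import HfApi

# Hypothetical values; the real repository id is not shown on this commit page.
api = HfApi()
api.upload_folder(
    folder_path="./ASDA",            # local folder to mirror
    repo_id="dianecy/<repo-name>",   # placeholder target repository
    path_in_repo="ASDA",             # keep the same layout in the repo
    commit_message="Upload folder using huggingface_hub",
)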

ASDA/model/__pycache__/model.cpython-39.pyc ADDED
Binary file (12.6 kB).

ASDA/model/__pycache__/model_sbert_gref.cpython-39.pyc ADDED
Binary file (13 kB).

ASDA/model/__pycache__/modules.cpython-39.pyc ADDED
Binary file (8.28 kB).

ASDA/model/__pycache__/position_encoding.cpython-39.pyc ADDED
Binary file (2 kB).

ASDA/model/__pycache__/transformer.cpython-39.pyc ADDED
Binary file (8.29 kB).
ASDA/model/model.py ADDED
@@ -0,0 +1,466 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .modules import ConvBatchNormReLU, SFA
from .modules import *
from .position_encoding import *

import clip
import math
import sys

sys.path.append('../')
from utils.utils import *


class Simple_fusion(nn.Module):
    def __init__(self, visual_dim=1024, text_dim=768, proj_dim=1024, jemb_drop_out=0.1, leaky=True):
        super(Simple_fusion, self).__init__()
        self.proj_dim = proj_dim
        self.mapping_visu = ConvBatchNormReLU(visual_dim, proj_dim, 1, 1, 0, 1, leaky=leaky)
        self.lang_attn = nn.Sequential(
            nn.Linear(text_dim, text_dim),
            nn.Tanh(),
            nn.Dropout(jemb_drop_out),
            nn.Softmax(dim=1))

        self.lang_proj = nn.Sequential(
            nn.Linear(text_dim, proj_dim),
            nn.BatchNorm1d(proj_dim),
            nn.LeakyReLU(0.1))

        self.fusion = nn.Sequential(
            nn.BatchNorm2d(proj_dim),
            nn.LeakyReLU(0.1))

    def forward(self, visual_feat, lang_feat):
        # visual proj
        visual_feat_proj = self.mapping_visu(visual_feat)  # [bt, 1024, 13, 13]

        """
        # lang attn
        lang_feat_attn = self.lang_attn(lang_feat)  # [bt, 15, 768]
        lang_feat_new = lang_feat * lang_feat_attn
        lang_feat_new = lang_feat_new.sum(dim=1)  # [bt, 768]
        """

        lang_feat = lang_feat.squeeze(1)
        # lang proj
        # lang_feat_new = self.lang_proj(lang_feat_new)  # [bt, 1024]
        lang_feat_new = self.lang_proj(lang_feat)  # [bt, 1024]

        # fusion: tile the sentence embedding over the grid and gate the visual features
        h, w = visual_feat.shape[-2], visual_feat.shape[-1]
        lang_feat_new_tile = lang_feat_new.view(-1, self.proj_dim, 1, 1).repeat(1, 1, h, w)  # [bt, 1024, 13, 13]
        fusion_feat = lang_feat_new_tile * visual_feat_proj
        fusion_feat = self.fusion(fusion_feat)
        return fusion_feat


class up_proj_cat_proj(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(up_proj_cat_proj, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_2, input_2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input_1 + input_2, do, 1, 1, 0, 1, leaky=leaky)

    def forward(self, x, y):
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        y = self.proj1(y)
        out = torch.cat([x, y], dim=1)
        out = self.proj2(out)
        return out


class pool_proj_cat_proj(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(pool_proj_cat_proj, self).__init__()
        self.downsample = nn.AvgPool2d(2, 2)
        self.proj1 = ConvBatchNormReLU(input_2, do // 2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(do // 2, do, 3, 1, 1, 1, leaky=leaky)
        self.proj3 = ConvBatchNormReLU(input_1 + do, do, 1, 1, 0, 1, leaky=leaky)

    def forward(self, x, y):
        y = self.downsample(y)
        y = self.proj1(y)
        y = self.proj2(y)
        output = self.proj3(torch.cat([x, y], dim=1))
        return output


class proj_cat_proj(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(proj_cat_proj, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_2, input_2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input_1 + input_2, do, 1, 1, 0, 1, leaky=leaky)

    def forward(self, x, y):
        y = self.proj1(y)
        out = torch.cat([x, y], dim=1)
        out = self.proj2(out)
        return out


class proj_cat(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(proj_cat, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_1, do // 2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(do // 2, do, 3, 1, 1, 1, leaky=leaky)

    def forward(self, x, y):
        x = self.proj1(x)
        x = self.proj2(x)
        output = torch.cat([x, y], dim=1)
        return output


class mask_decoder(nn.Module):
    def __init__(self, input_1, seg_out_stride=2, leaky=True):
        super(mask_decoder, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_1, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)

        self.proj3 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        self.proj4 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        self.proj5 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        # self.proj = nn.Conv2d(input_1, 1, 3, 1, 1, 1)
        self.proj = nn.Conv2d(input_1 // 2, 32, 3, 1, 1, 1)

    def forward(self, x, seg_out_stride):
        x = self.proj1(x)
        x = self.proj2(x)

        # progressively upsample by 2 until the requested output stride is reached
        if seg_out_stride <= 8:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
            x = self.proj3(x)

        if seg_out_stride <= 4:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
            x = self.proj4(x)

        if seg_out_stride <= 2:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
            x = self.proj5(x)

        x = self.proj(x)

        return x


# class FeatureSelector(nn.Module):
#     def __init__(self, img_feature_dim, text_feature_dim, output_dim):
#         super(FeatureSelector, self).__init__()
#         # use nn.Sequential to keep the MLP definition compact
#         self.mlp = nn.Sequential(
#             nn.Linear(img_feature_dim * 3 + text_feature_dim * 3, 1024),
#             nn.ReLU(),
#             nn.Linear(1024, 256),
#             nn.ReLU(),
#             nn.Linear(256, output_dim)
#         )

#     def forward(self, img_features, text_feature):
#         # concatenate the image features with the text feature
#         combined_features = torch.cat(img_features + text_feature, dim=1)
#         # run the MLP to obtain the output scores
#         scores = self.mlp(combined_features)
#         return scores


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionblk(nn.Module):
    def __init__(self, clip_module):
        super().__init__()

        self.clip_module = clip_module

        self.selected_tokens = int(676 * 0.8)

        # self.norm = nn.LayerNorm(768)

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor = None, lang_tokens=None, index=0):

        if lang_tokens is None:
            x = x + self.clip_module.attention(self.clip_module.ln_1(x))
        else:
            # if 4 <= index <= 7:
            #     self.selected_tokens = int(676 * 0.8)
            # elif 8 <= index <= 11:
            #     self.selected_tokens = int(676 * 0.5)
            # print(index)
            # print(self.selected_tokens)

            N, B, C = x.shape  # N x B x C
            cls_x = x[:1, :, :]  # 1 x B x C
            x = x[1:, :, :]  # M x B x C

            ### img_cls text_cls
            # x = torch.mul(x, cls_x)
            # x = self.norm(x.reshape((N - 1) * B, C))
            # x = x.reshape(N - 1, B, C)

            ### text eos token
            # score = torch.bmm(x.transpose(0, 1), lang_tokens).squeeze(-1)

            ### text features mean
            # rank patch tokens by similarity to the averaged text tokens
            score = torch.bmm(x.transpose(0, 1), lang_tokens.permute(1, 2, 0)).mean(dim=-1)  # B x N
            score = score.transpose(0, 1)  # N x B

            sorted_scores, sorted_indices = torch.sort(score, descending=True, dim=0)

            # high_mask = sorted_scores > sorted_scores[self.selected_tokens:self.selected_tokens + 1, :]
            high_mask = torch.ones_like(sorted_scores)
            for i in range(B):
                high_mask[sorted_indices[self.selected_tokens:, i], i] = 0
            high_mask = high_mask > 0.5

            delta_x = x[high_mask].reshape(-1, B, C)  # M x B x C
            low_x = x[~high_mask].reshape(-1, B, C)  # N-M x B x C
            low_score = score[~high_mask].reshape(-1, B, 1)  # N-M x B x 1

            # merge the low-scoring tokens into a single score-weighted token
            low_x = low_x * torch.softmax(low_score, dim=0)  # N-M x B x C
            low_x = low_x.sum(dim=0, keepdim=True)  # 1 x B x C

            delta_x = torch.cat([cls_x, delta_x, low_x], dim=0)  # M+2 x B x C
            delta_x = self.clip_module.attention(self.clip_module.ln_1(delta_x))

            # for i in range(B):
            #     x[high_mask[:, i], i, :] += delta_x[1:-1, i, :]
            #     x[~high_mask[:, i], i, :] += delta_x[-1:, i, :]
            #     cls_x[:, i] += delta_x[:1, i, :]
            temple = torch.zeros_like(x).type(delta_x.type())
            temple[high_mask] = delta_x[1:-1, :, :].reshape(-1, C)
            temple[~high_mask] = delta_x[-1:, :, :].reshape(-1, 1, C).repeat(1, 676 - self.selected_tokens, 1).reshape(-1, C)
            x = x + temple
            cls_x = cls_x + delta_x[:1, :, :]

            x = torch.cat([cls_x, x], dim=0)

        x = x + self.clip_module.mlp(self.clip_module.ln_2(x))
        return x


class Model(nn.Module):
    def __init__(self, clip_model='RN50', tunelang=False, fusion_dim=2048, num_query=16, do=512, leaky=True, length=17):
        super(Model, self).__init__()

        self.tunelang = tunelang
        self.length = length

        ## Init Encoders
        clip_models = clip.load(clip_model, jit=False, device=torch.device("cpu"))[0].cuda()

        self.visumodel = clip_models.visual
        self.visu_dim = 768

        self.cut_list = []
        self.visu_resblocks = nn.ModuleList([ResidualAttentionblk(self.visumodel.transformer.resblocks[i]) for i in range(12)])
        self.visu_proj = nn.ModuleList([nn.Linear(do, self.visu_dim) for _ in range(len(self.cut_list))])

        self.positional_embedding = nn.Parameter(torch.FloatTensor(1, 26 ** 2 + 1, 768))
        v = self.resize_pos_embed(self.visumodel.positional_embedding.data.unsqueeze(0), self.positional_embedding, 26, 26)
        self.positional_embedding.data.copy_(v)

        self.textmodel = clip_models.transformer
        self.textmodel_token_embedding = clip_models.token_embedding
        self.textmodel_pos_embed = nn.Parameter(clip_models.positional_embedding[:self.length, :].unsqueeze(0))
        self.textmodel_ln_final = clip_models.ln_final
        self.textdim = self.textmodel_pos_embed.shape[-1]
        for module in self.textmodel.resblocks:
            module.attn_mask = self.build_attention_mask()

        # vis select
        self.vis_select = nn.Linear(self.visu_dim, do, bias=False)

        ## Fusion

        # fusion with x12
        self.fusion = Simple_fusion(visual_dim=self.visu_dim, text_dim=self.textdim, proj_dim=fusion_dim)

        # fusion with x6
        self.up_proj_cat_proj_1 = proj_cat_proj(input_1=fusion_dim, input_2=self.visu_dim, do=fusion_dim)
        self.pool_proj_cat_proj_2 = proj_cat_proj(input_1=fusion_dim, input_2=self.visu_dim, do=do)

        # fusion with x9
        self.proj_cat = proj_cat(input_1=fusion_dim, input_2=do, do=do)
        self.up_proj_cat_2 = proj_cat_proj(input_1=fusion_dim, input_2=do * 2, do=do)
        self.proj_0 = ConvBatchNormReLU(do, do, 1, 1, 0, 1, leaky=leaky)

        self.fpn = SFA(in_channels=self.visu_dim, out_channels=do)

        ## Align dim
        f_dim = 512
        self.fc_2 = nn.Linear(f_dim, f_dim, bias=False)
        self.norm1 = nn.LayerNorm(f_dim)
        self.norm2 = nn.LayerNorm(f_dim)

        # visual branch
        self.pos_embedding = PositionEmbeddingSine(f_dim)
        encoder_layer = TransformerEncoderLayer(f_dim, nhead=8, dim_feedforward=f_dim,
                                                dropout=0.1, activation='relu', normalize_before=False)
        self.encoder = TransformerEncoder(encoder_layer, num_layers=2, norm=nn.LayerNorm(f_dim))

        ## Decoder
        self.mask_decoder = mask_decoder(f_dim, seg_out_stride=2)

        # text branch

        ## coef
        self.lang_tf_enc = lang_tf_enc(do, do, do, head_num=8)
        self.proj1 = ConvBatchNormReLU(do, do, 3, 1, 1, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(do, do, 3, 1, 1, 1, leaky=leaky)
        self.proj3 = nn.Conv2d(do, 32, 3, 1, 1, 1)
        self.projout = nn.Linear(26 * 26 * 32, 32, bias=False)

        self.feature_selector_l = nn.Linear(do, 1, bias=True)
        self.feature_selector_m = nn.Linear(do, 1, bias=True)

    def resize_pos_embed(self, posemb, posemb_new, hight, width):
        ntok_new = posemb_new.shape[1]

        posemb_token, posemb_grid = posemb[:, :1], posemb[0, 1:]
        ntok_new -= 1

        gs_old = int(math.sqrt(len(posemb_grid)))
        print('Resized position embedding from size: {} to size: {} with height: {} width: {}'.format(posemb.shape, posemb_new.shape, hight, width))
        posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
        posemb_grid = F.interpolate(posemb_grid, size=(hight, width), mode='bilinear')
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, hight * width, -1)
        posemb = torch.cat([posemb_token, posemb_grid], dim=1)
        return posemb

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.length, self.length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, image, word_id, word_mask):
        ## Visual Module

        batch_size = image.size(0)

        # Extract features from vision
        x = self.visumodel.conv1(image)
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.visumodel.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.visumodel.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND

        raw_fword = self.textmodel_token_embedding(word_id).squeeze(1)
        raw_fword = raw_fword + self.textmodel_pos_embed
        raw_fword = raw_fword.permute(1, 0, 2)  # NLD -> LND

        visu_list_l = []
        visu_list_m = []

        scores_l = []
        scores_m = []

        for i, (blk_visu, blk_lang) in enumerate(zip(self.visu_resblocks, self.textmodel.resblocks)):
            x = blk_visu(x)  # [677, bs, 768]
            raw_fword = blk_lang(raw_fword)

            img_cls = self.vis_select(x[0, :, :])  # [B, C]
            tex_cls = raw_fword[word_id.argmax(dim=-1).reshape(-1), torch.arange(raw_fword.shape[1]), :]  # [B, C]
            score = img_cls * tex_cls  # [B, C]
            score = score.unsqueeze(1)  # [B, 1, C]

            if 3 <= i <= 5:
                visu_list_l.append(x)
                scores_l.append(score)

            if 6 <= i <= 8:
                visu_list_m.append(x)
                scores_m.append(score)

        scores_l = torch.cat(scores_l, dim=1)  # [B, 3, C]
        scores_m = torch.cat(scores_m, dim=1)  # [B, 3, C]

        scores_l = self.feature_selector_l(scores_l).squeeze(-1)  # [B, 3]
        scores_l = F.softmax(scores_l, dim=-1)
        scores_m = self.feature_selector_m(scores_m).squeeze(-1)  # [B, 3]
        scores_m = F.softmax(scores_m, dim=-1)

        visu_list_l = torch.cat(visu_list_l, dim=0).reshape(len(visu_list_l), -1, batch_size, self.visu_dim).permute(0, 2, 1, 3)
        visu_list_m = torch.cat(visu_list_m, dim=0).reshape(len(visu_list_m), -1, batch_size, self.visu_dim).permute(0, 2, 1, 3)

        # pick, per sample, the layer with the highest selector score
        x6 = visu_list_l[scores_l.argmax(dim=-1).reshape(-1), torch.arange(visu_list_l.shape[1]), :, :].permute(1, 0, 2)
        x9 = visu_list_m[scores_m.argmax(dim=-1).reshape(-1), torch.arange(visu_list_m.shape[1]), :, :].permute(1, 0, 2)

        x6 = x6.permute(1, 0, 2)[:, 1:, :].reshape(-1, 26, 26, self.visu_dim).permute(0, 3, 1, 2)
        x9 = x9.permute(1, 0, 2)[:, 1:, :].reshape(-1, 26, 26, self.visu_dim).permute(0, 3, 1, 2)
        x12 = x.permute(1, 0, 2)[:, 1:, :]
        x12 = x12.reshape(-1, 26, 26, self.visu_dim).permute(0, 3, 1, 2)  # [bs, 768, 26, 26]

        raw_fword = raw_fword.permute(1, 0, 2)
        raw_fword = self.textmodel_ln_final(raw_fword)

        if not self.tunelang:
            raw_fword = raw_fword.detach()

        eos_token = raw_fword[torch.arange(raw_fword.shape[0]), word_id.argmax(dim=-1).reshape(-1), :]

        F_g = self.fusion(x12, eos_token)
        F_tf = self.fpn([F_g, x9, x6])

        # Main body
        b, c, h, w = F_tf.shape

        flatten_length = h * w
        visu_feat = F_tf.reshape(b, c, flatten_length)
        visu_feat = F.relu(visu_feat)
        lang_feat = F.relu(self.fc_2(raw_fword))

        visu_feat = visu_feat.permute(0, 2, 1)
        pos_embed = self.pos_embedding(visu_feat)
        visu_feat = visu_feat.transpose(0, 1)
        pos_embed = pos_embed.transpose(0, 1)
        visu_feat = self.encoder(visu_feat, pos=pos_embed)
        # [HW B C]

        visu_feat_ = visu_feat.permute(1, 0, 2)

        # mask decoder
        visu_feat = visu_feat.reshape(h, w, b, c)
        visu_feat = visu_feat.permute(2, 3, 0, 1)
        proto_masks = self.mask_decoder(visu_feat, 2)

        # [B C H W]
        proto_masks = F.relu(proto_masks)

        # coef
        coef = self.lang_tf_enc(visu_feat_, lang_feat)
        coef = coef.view(b, h, w, c)
        coef = coef.permute(0, 3, 1, 2)

        coef = self.proj1(coef)
        coef = self.proj2(coef)
        coef = self.proj3(coef)
        coef = coef.permute(0, 2, 3, 1)
        coef = coef.contiguous().view(b, h * w * 32)
        # [b, 1, 32]
        coef = self.projout(coef).unsqueeze(-1)
        coef = torch.tanh(coef)

        # mask assemble
        proto_masks = proto_masks.permute(0, 2, 3, 1)
        proto_masks = proto_masks.view(b, -1, 32)
        # [B HW N] [32 208*208 32]

        mask_out = torch.bmm(proto_masks, coef)
        mask_out = mask_out.view(b, 208, 208, 1)
        mask_out = mask_out.permute(0, 3, 1, 2)
        return mask_out
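The last step of `Model.forward` assembles the output mask as a linear combination of 32 prototype masks weighted by a language-conditioned coefficient vector. A minimal standalone sketch of that assembly, with random tensors standing in for the `mask_decoder` and `projout` outputs:

import torch

b, hw, n = 2, 208 * 208, 32                # batch, flattened mask resolution, prototypes
proto_masks = torch.rand(b, hw, n)         # stand-in for the decoded prototypes, [B, HW, N]
coef = torch.tanh(torch.randn(b, n, 1))    # stand-in for the coefficient branch, [B, N, 1]

mask_out = torch.bmm(proto_masks, coef)    # [B, HW, 1]: weighted sum over prototypes
mask_out = mask_out.view(b, 208, 208, 1).permute(0, 3, 1, 2)  # [B, 1, 208, 208]
print(mask_out.shape)  # torch.Size([2, 1, 208, 208])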
ASDA/model/model_sbert_gref.py ADDED
@@ -0,0 +1,488 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .modules import ConvBatchNormReLU, SFA
from .modules import *
from .position_encoding import *

import clip
import math
import sys

sys.path.append('../')
from utils.utils import *


class Simple_fusion(nn.Module):
    def __init__(self, visual_dim=1024, text_dim=768, proj_dim=1024, jemb_drop_out=0.1, leaky=True):
        super(Simple_fusion, self).__init__()
        self.proj_dim = proj_dim
        self.mapping_visu = ConvBatchNormReLU(visual_dim, proj_dim, 1, 1, 0, 1, leaky=leaky)
        self.lang_attn = nn.Sequential(
            nn.Linear(text_dim, text_dim),
            nn.Tanh(),
            nn.Dropout(jemb_drop_out),
            nn.Softmax(dim=1))

        self.lang_proj = nn.Sequential(
            nn.Linear(text_dim, proj_dim),
            nn.BatchNorm1d(proj_dim),
            nn.LeakyReLU(0.1))

        self.fusion = nn.Sequential(
            nn.BatchNorm2d(proj_dim),
            nn.LeakyReLU(0.1))

    def forward(self, visual_feat, lang_feat):
        # visual proj
        visual_feat_proj = self.mapping_visu(visual_feat)  # [bt, 1024, 13, 13]

        """
        # lang attn
        lang_feat_attn = self.lang_attn(lang_feat)  # [bt, 15, 768]
        lang_feat_new = lang_feat * lang_feat_attn
        lang_feat_new = lang_feat_new.sum(dim=1)  # [bt, 768]
        """

        lang_feat = lang_feat.squeeze(1)
        # lang proj
        # lang_feat_new = self.lang_proj(lang_feat_new)  # [bt, 1024]
        lang_feat_new = self.lang_proj(lang_feat)  # [bt, 1024]

        # fusion: tile the sentence embedding over the grid and gate the visual features
        h, w = visual_feat.shape[-2], visual_feat.shape[-1]
        lang_feat_new_tile = lang_feat_new.view(-1, self.proj_dim, 1, 1).repeat(1, 1, h, w)  # [bt, 1024, 13, 13]
        fusion_feat = lang_feat_new_tile * visual_feat_proj
        fusion_feat = self.fusion(fusion_feat)
        return fusion_feat


class up_proj_cat_proj(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(up_proj_cat_proj, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_2, input_2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input_1 + input_2, do, 1, 1, 0, 1, leaky=leaky)

    def forward(self, x, y):
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        y = self.proj1(y)
        out = torch.cat([x, y], dim=1)
        out = self.proj2(out)
        return out


class pool_proj_cat_proj(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(pool_proj_cat_proj, self).__init__()
        self.downsample = nn.AvgPool2d(2, 2)
        self.proj1 = ConvBatchNormReLU(input_2, do // 2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(do // 2, do, 3, 1, 1, 1, leaky=leaky)
        self.proj3 = ConvBatchNormReLU(input_1 + do, do, 1, 1, 0, 1, leaky=leaky)

    def forward(self, x, y):
        y = self.downsample(y)
        y = self.proj1(y)
        y = self.proj2(y)
        output = self.proj3(torch.cat([x, y], dim=1))
        return output


class proj_cat_proj(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(proj_cat_proj, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_2, input_2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input_1 + input_2, do, 1, 1, 0, 1, leaky=leaky)

    def forward(self, x, y):
        y = self.proj1(y)
        out = torch.cat([x, y], dim=1)
        out = self.proj2(out)
        return out


class proj_cat(nn.Module):
    def __init__(self, input_1, input_2, do=512, leaky=True):
        super(proj_cat, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_1, do // 2, 1, 1, 0, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(do // 2, do, 3, 1, 1, 1, leaky=leaky)

    def forward(self, x, y):
        x = self.proj1(x)
        x = self.proj2(x)
        output = torch.cat([x, y], dim=1)
        return output


class mask_decoder(nn.Module):
    def __init__(self, input_1, seg_out_stride=2, leaky=True):
        super(mask_decoder, self).__init__()
        self.proj1 = ConvBatchNormReLU(input_1, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)

        self.proj3 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        self.proj4 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        self.proj5 = ConvBatchNormReLU(input_1 // 2, input_1 // 2, 3, 1, 1, 1, leaky=leaky)
        # self.proj = nn.Conv2d(input_1, 1, 3, 1, 1, 1)
        self.proj = nn.Conv2d(input_1 // 2, 32, 3, 1, 1, 1)

    def forward(self, x, seg_out_stride):
        x = self.proj1(x)
        x = self.proj2(x)

        # progressively upsample by 2 until the requested output stride is reached
        if seg_out_stride <= 8:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
            x = self.proj3(x)

        if seg_out_stride <= 4:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
            x = self.proj4(x)

        if seg_out_stride <= 2:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
            x = self.proj5(x)

        x = self.proj(x)

        return x


# class FeatureSelector(nn.Module):
#     def __init__(self, img_feature_dim, text_feature_dim, output_dim):
#         super(FeatureSelector, self).__init__()
#         # use nn.Sequential to keep the MLP definition compact
#         self.mlp = nn.Sequential(
#             nn.Linear(img_feature_dim * 3 + text_feature_dim * 3, 1024),
#             nn.ReLU(),
#             nn.Linear(1024, 256),
#             nn.ReLU(),
#             nn.Linear(256, output_dim)
#         )

#     def forward(self, img_features, text_feature):
#         # concatenate the image features with the text feature
#         combined_features = torch.cat(img_features + text_feature, dim=1)
#         # run the MLP to obtain the output scores
#         scores = self.mlp(combined_features)
#         return scores


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionblk(nn.Module):
    def __init__(self, clip_module):
        super().__init__()

        self.clip_module = clip_module

        self.selected_tokens = int(676 * 0.8)

        # self.norm = nn.LayerNorm(768)

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor = None, lang_tokens=None, index=0):

        if lang_tokens is None:
            x = x + self.clip_module.attention(self.clip_module.ln_1(x))
        else:
            # if 4 <= index <= 7:
            #     self.selected_tokens = int(676 * 0.8)
            # elif 8 <= index <= 11:
            #     self.selected_tokens = int(676 * 0.5)
            # print(index)
            # print(self.selected_tokens)

            N, B, C = x.shape  # N x B x C
            cls_x = x[:1, :, :]  # 1 x B x C
            x = x[1:, :, :]  # M x B x C

            ### img_cls text_cls
            # x = torch.mul(x, cls_x)
            # x = self.norm(x.reshape((N - 1) * B, C))
            # x = x.reshape(N - 1, B, C)

            ### text eos token
            # score = torch.bmm(x.transpose(0, 1), lang_tokens).squeeze(-1)

            ### text features mean
            # rank patch tokens by similarity to the averaged text tokens
            score = torch.bmm(x.transpose(0, 1), lang_tokens.permute(1, 2, 0)).mean(dim=-1)  # B x N
            score = score.transpose(0, 1)  # N x B

            sorted_scores, sorted_indices = torch.sort(score, descending=True, dim=0)

            # high_mask = sorted_scores > sorted_scores[self.selected_tokens:self.selected_tokens + 1, :]
            high_mask = torch.ones_like(sorted_scores)
            for i in range(B):
                high_mask[sorted_indices[self.selected_tokens:, i], i] = 0
            high_mask = high_mask > 0.5

            delta_x = x[high_mask].reshape(-1, B, C)  # M x B x C
            low_x = x[~high_mask].reshape(-1, B, C)  # N-M x B x C
            low_score = score[~high_mask].reshape(-1, B, 1)  # N-M x B x 1

            # merge the low-scoring tokens into a single score-weighted token
            low_x = low_x * torch.softmax(low_score, dim=0)  # N-M x B x C
            low_x = low_x.sum(dim=0, keepdim=True)  # 1 x B x C

            delta_x = torch.cat([cls_x, delta_x, low_x], dim=0)  # M+2 x B x C
            delta_x = self.clip_module.attention(self.clip_module.ln_1(delta_x))

            # for i in range(B):
            #     x[high_mask[:, i], i, :] += delta_x[1:-1, i, :]
            #     x[~high_mask[:, i], i, :] += delta_x[-1:, i, :]
            #     cls_x[:, i] += delta_x[:1, i, :]
            temple = torch.zeros_like(x).type(delta_x.type())
            temple[high_mask] = delta_x[1:-1, :, :].reshape(-1, C)
            temple[~high_mask] = delta_x[-1:, :, :].reshape(-1, 1, C).repeat(1, 676 - self.selected_tokens, 1).reshape(-1, C)
            x = x + temple
            cls_x = cls_x + delta_x[:1, :, :]

            x = torch.cat([cls_x, x], dim=0)

        x = x + self.clip_module.mlp(self.clip_module.ln_2(x))
        return x


class Model_CL(nn.Module):
    def __init__(self, clip_model='RN50', tunelang=False, fusion_dim=2048, num_query=16, do=512, leaky=True, length=17, fuse_mode='coarse', use_projections=False):
        super(Model_CL, self).__init__()

        self.tunelang = tunelang
        self.length = length

        ## Init Encoders
        clip_models = clip.load(clip_model, jit=False, device=torch.device("cpu"))[0].cuda()

        self.visumodel = clip_models.visual
        self.visu_dim = 768
        self.fuse_mode = fuse_mode

        self.cut_list = []
        self.visu_resblocks = nn.ModuleList([ResidualAttentionblk(self.visumodel.transformer.resblocks[i]) for i in range(12)])
        self.visu_proj = nn.ModuleList([nn.Linear(do, self.visu_dim) for _ in range(len(self.cut_list))])

        self.positional_embedding = nn.Parameter(torch.FloatTensor(1, 26 ** 2 + 1, 768))
        v = self.resize_pos_embed(self.visumodel.positional_embedding.data.unsqueeze(0), self.positional_embedding, 26, 26)
        self.positional_embedding.data.copy_(v)

        self.textmodel = clip_models.transformer
        self.textmodel_token_embedding = clip_models.token_embedding
        self.textmodel_pos_embed = nn.Parameter(clip_models.positional_embedding[:self.length, :].unsqueeze(0))
        self.textmodel_ln_final = clip_models.ln_final
        self.textdim = self.textmodel_pos_embed.shape[-1]
        for module in self.textmodel.resblocks:
            module.attn_mask = self.build_attention_mask()

        # vis select
        self.vis_select = nn.Linear(self.visu_dim, do, bias=False)

        ## Fusion
        # fusion with x12
        self.fusion = Simple_fusion(visual_dim=self.visu_dim, text_dim=self.textdim, proj_dim=fusion_dim)

        # fusion with x6
        self.up_proj_cat_proj_1 = proj_cat_proj(input_1=fusion_dim, input_2=self.visu_dim, do=fusion_dim)
        self.pool_proj_cat_proj_2 = proj_cat_proj(input_1=fusion_dim, input_2=self.visu_dim, do=do)

        # fusion with x9
        self.proj_cat = proj_cat(input_1=fusion_dim, input_2=do, do=do)
        self.up_proj_cat_2 = proj_cat_proj(input_1=fusion_dim, input_2=do * 2, do=do)
        self.proj_0 = ConvBatchNormReLU(do, do, 1, 1, 0, 1, leaky=leaky)

        self.fpn = SFA(in_channels=self.visu_dim, out_channels=do)

        ## use projections?
        self.use_projections = use_projections
        if self.use_projections:
            self.projection_1 = nn.Linear(512, 512, bias=True)
        else:
            self.projection_1 = None

        ## Align dim
        f_dim = 512
        self.fc_2 = nn.Linear(f_dim, f_dim, bias=False)
        self.norm1 = nn.LayerNorm(f_dim)
        self.norm2 = nn.LayerNorm(f_dim)

        # visual branch
        self.pos_embedding = PositionEmbeddingSine(f_dim)
        encoder_layer = TransformerEncoderLayer(f_dim, nhead=8, dim_feedforward=f_dim,
                                                dropout=0.1, activation='relu', normalize_before=False)
        self.encoder = TransformerEncoder(encoder_layer, num_layers=2, norm=nn.LayerNorm(f_dim))

        ## Decoder
        self.mask_decoder = mask_decoder(f_dim, seg_out_stride=2)

        # text branch

        ## coef
        self.lang_tf_enc = lang_tf_enc(do, do, do, head_num=8)
        self.proj1 = ConvBatchNormReLU(do, do, 3, 1, 1, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(do, do, 3, 1, 1, 1, leaky=leaky)
        self.proj3 = nn.Conv2d(do, 32, 3, 1, 1, 1)
        self.projout = nn.Linear(26 * 26 * 32, 32, bias=False)

        self.feature_selector_l = nn.Linear(do, 1, bias=True)
        self.feature_selector_m = nn.Linear(do, 1, bias=True)

    def resize_pos_embed(self, posemb, posemb_new, hight, width):
        ntok_new = posemb_new.shape[1]

        posemb_token, posemb_grid = posemb[:, :1], posemb[0, 1:]
        ntok_new -= 1

        gs_old = int(math.sqrt(len(posemb_grid)))
        print('Resized position embedding from size: {} to size: {} with height: {} width: {}'.format(posemb.shape, posemb_new.shape, hight, width))
        posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
        posemb_grid = F.interpolate(posemb_grid, size=(hight, width), mode='bilinear')
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, hight * width, -1)
        posemb = torch.cat([posemb_token, posemb_grid], dim=1)
        return posemb

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.length, self.length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, image, word_id, word_mask):
        ## Visual Module

        batch_size = image.size(0)

        # Extract features from vision
        x = self.visumodel.conv1(image)
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.visumodel.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.visumodel.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND

        raw_fword = self.textmodel_token_embedding(word_id).squeeze(1)
        raw_fword = raw_fword + self.textmodel_pos_embed
        raw_fword = raw_fword.permute(1, 0, 2)  # NLD -> LND

        visu_list_l = []
        visu_list_m = []

        scores_l = []
        scores_m = []

        for i, (blk_visu, blk_lang) in enumerate(zip(self.visu_resblocks, self.textmodel.resblocks)):
            x = blk_visu(x)  # [677, bs, 768]
            raw_fword = blk_lang(raw_fword)

            img_cls = self.vis_select(x[0, :, :])  # [B, C]
            tex_cls = raw_fword[word_id.argmax(dim=-1).reshape(-1), torch.arange(raw_fword.shape[1]), :]  # [B, C]
            score = img_cls * tex_cls  # [B, C]
            score = score.unsqueeze(1)  # [B, 1, C]

            if 3 <= i <= 5:
                visu_list_l.append(x)
                scores_l.append(score)

            if 6 <= i <= 8:
                visu_list_m.append(x)
                scores_m.append(score)

        scores_l = torch.cat(scores_l, dim=1)  # [B, 3, C]
        scores_m = torch.cat(scores_m, dim=1)  # [B, 3, C]

        scores_l = self.feature_selector_l(scores_l).squeeze(-1)  # [B, 3]
        scores_l = F.softmax(scores_l, dim=-1)
        scores_m = self.feature_selector_m(scores_m).squeeze(-1)  # [B, 3]
        scores_m = F.softmax(scores_m, dim=-1)

        visu_list_l = torch.cat(visu_list_l, dim=0).reshape(len(visu_list_l), -1, batch_size, self.visu_dim).permute(0, 2, 1, 3)
        visu_list_m = torch.cat(visu_list_m, dim=0).reshape(len(visu_list_m), -1, batch_size, self.visu_dim).permute(0, 2, 1, 3)

        # pick, per sample, the layer with the highest selector score
        x6 = visu_list_l[scores_l.argmax(dim=-1).reshape(-1), torch.arange(visu_list_l.shape[1]), :, :].permute(1, 0, 2)
        x9 = visu_list_m[scores_m.argmax(dim=-1).reshape(-1), torch.arange(visu_list_m.shape[1]), :, :].permute(1, 0, 2)

        x6 = x6.permute(1, 0, 2)[:, 1:, :].reshape(-1, 26, 26, self.visu_dim).permute(0, 3, 1, 2)
        x9 = x9.permute(1, 0, 2)[:, 1:, :].reshape(-1, 26, 26, self.visu_dim).permute(0, 3, 1, 2)
        x12 = x.permute(1, 0, 2)[:, 1:, :]
        x12 = x12.reshape(-1, 26, 26, self.visu_dim).permute(0, 3, 1, 2)  # [bs, 768, 26, 26]

        raw_fword = raw_fword.permute(1, 0, 2)
        raw_fword = self.textmodel_ln_final(raw_fword)

        if not self.tunelang:
            raw_fword = raw_fword.detach()

        eos_token = raw_fword[torch.arange(raw_fword.shape[0]), word_id.argmax(dim=-1).reshape(-1), :]

        F_g = self.fusion(x12, eos_token)
        F_tf = self.fpn([F_g, x9, x6])

        # Main body
        b, c, h, w = F_tf.shape

        flatten_length = h * w
        visu_feat = F_tf.reshape(b, c, flatten_length)
        visu_feat = F.relu(visu_feat)
        lang_feat = F.relu(self.fc_2(raw_fword))

        visu_feat = visu_feat.permute(0, 2, 1)
        pos_embed = self.pos_embedding(visu_feat)
        visu_feat = visu_feat.transpose(0, 1)
        pos_embed = pos_embed.transpose(0, 1)
        visu_feat = self.encoder(visu_feat, pos=pos_embed)
        # [HW B C]

        visu_feat_ = visu_feat.permute(1, 0, 2)

        # mask decoder
        visu_feat = visu_feat.reshape(h, w, b, c)
        visu_feat = visu_feat.permute(2, 3, 0, 1)
        F_coarse_refined = visu_feat
        proto_masks = self.mask_decoder(visu_feat, 2)

        # [B C H W]
        proto_masks = F.relu(proto_masks)

        # coef
        coef = self.lang_tf_enc(visu_feat_, lang_feat)
        coef = coef.view(b, h, w, c)
        coef = coef.permute(0, 3, 1, 2)
        F_fine = coef

        coef = self.proj1(coef)
        coef = self.proj2(coef)
        coef = self.proj3(coef)
        coef = coef.permute(0, 2, 3, 1)
        coef = coef.contiguous().view(b, h * w * 32)
        # [b, 1, 32]
        coef = self.projout(coef).unsqueeze(-1)
        coef = torch.tanh(coef)

        # mask assemble
        proto_masks = proto_masks.permute(0, 2, 3, 1)
        proto_masks = proto_masks.view(b, -1, 32)
        # [B HW N] [32 208*208 32]

        mask_out = torch.bmm(proto_masks, coef)
        mask_out = mask_out.view(b, 208, 208, 1)
        mask_out = mask_out.permute(0, 3, 1, 2)

        if self.fuse_mode == 'coarse':
            metric_tensor = F_tf
        elif self.fuse_mode == 'refined_coarse':
            metric_tensor = F_coarse_refined
        elif self.fuse_mode == 'fine':
            metric_tensor = F_fine
        else:
            # guard against a silent NameError for unknown modes
            raise ValueError(f"unsupported fuse_mode: {self.fuse_mode}")

        if self.use_projections:
            metric_tensor = F.adaptive_avg_pool2d(metric_tensor, (1, 1)).view(metric_tensor.size(0), -1)
            metric_tensor = self.projection_1(metric_tensor).unsqueeze(-1).unsqueeze(-1)

        return mask_out, metric_tensor
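`Model_CL` differs from `Model` mainly in that it also returns a `metric_tensor` for metric or contrastive learning, optionally pooled and linearly projected. A minimal sketch of that projection head in isolation, with a random feature map standing in for whichever `fuse_mode` tensor is selected:

import torch
import torch.nn as nn
import torch.nn.functional as F

projection = nn.Linear(512, 512, bias=True)   # mirrors self.projection_1
metric_tensor = torch.randn(2, 512, 26, 26)   # stand-in for F_tf / F_coarse_refined / F_fine

pooled = F.adaptive_avg_pool2d(metric_tensor, (1, 1)).view(metric_tensor.size(0), -1)  # [B, 512]
embedded = projection(pooled).unsqueeze(-1).unsqueeze(-1)                              # [B, 512, 1, 1]
print(embedded.shape)  # torch.Size([2, 512, 1, 1])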
ASDA/model/modules.py ADDED
@@ -0,0 +1,391 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .transformer import lang_tf_enc, TransformerEncoderLayer, TransformerEncoder
from .position_encoding import PositionEmbeddingSine


class SFA(nn.Module):
    def __init__(self, in_channels, out_channels, scale_factors=[1, 2, 4], fuse_type="sum"):
        super(SFA, self).__init__()
        self.stages = []
        for idx, scale in enumerate(scale_factors):
            out_dim = out_channels
            if scale == 4.0:
                layers = [
                    nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2),
                    nn.BatchNorm2d(
                        num_features=in_channels // 2, eps=1e-5, momentum=0.999, affine=True),
                    nn.GELU(),
                    nn.ConvTranspose2d(in_channels // 2, in_channels // 4, kernel_size=2, stride=2),
                ]
                out_dim = in_channels // 4
            elif scale == 2.0:
                layers = [nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)]
                out_dim = in_channels // 2
            elif scale == 1.0:
                layers = []
                out_dim = in_channels
            elif scale == 0.5:
                layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                raise NotImplementedError(f"scale_factor={scale} is not supported yet.")

            layers.extend(
                [
                    ConvBatchNormReLU(out_dim, out_channels, 1, 1, 0, 1, leaky=True),
                    ConvBatchNormReLU(out_channels, out_channels, 3, 1, 1, 1, leaky=True),
                ]
            )
            layers = nn.Sequential(*layers)
            self.stages.append(layers)

        self.stages = nn.ModuleList(self.stages)

        # assume all input feature maps share the same channel count
        self.lateral_convs = nn.ModuleList([
            ConvBatchNormReLU(out_channels, out_channels, 1, 1, 0, 1, leaky=True) for _ in range(3)
        ])

        self.output_convs = nn.ModuleList([
            ConvBatchNormReLU(out_channels, out_channels, 3, 1, 1, 1, leaky=True) for _ in range(3)
        ])

        self._fuse_type = fuse_type  # or "avg"

        self.downsample = nn.MaxPool2d(kernel_size=4, stride=4, padding=0)

    def forward(self, x):
        '''
        Args:
            x: list[Tensor], T feature maps with identical spatial size and channel count, [x12, x9, x6]
        '''
        # emulate a bottom-up pathway to obtain multi-scale feature maps
        multi_scale_features = []
        for idx, stage in enumerate(self.stages):
            multi_scale_features.append(stage(x[idx]))

        # top-down
        results = []
        prev_features = self.lateral_convs[0](multi_scale_features[0])

        for idx, (lateral_conv, output_conv) in enumerate(
            zip(self.lateral_convs, self.output_convs)
        ):
            # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336
            # Therefore we loop over all modules but skip the first one
            if idx > 0:
                features = multi_scale_features[idx]
                top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
                lateral_features = lateral_conv(features)  # 1x1 convolution
                prev_features = lateral_features + top_down_features
                if self._fuse_type == "avg":
                    prev_features /= 2
            results.insert(0, output_conv(prev_features))

        fused_features = self.downsample(results[0])  # 1/4 resolution; pool back to 1/16 resolution

        return fused_features


class ConvBatchNormReLU(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        leaky=False,
        relu=True,
        instance=False,
    ):
        super(ConvBatchNormReLU, self).__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=False)
        # nn.init.kaiming_normal_(self.conv.weight, mode="fan_out", nonlinearity="leaky_relu" if leaky else "relu")

        if instance:
            self.bn = nn.InstanceNorm2d(num_features=out_channels)
        else:
            self.bn = nn.BatchNorm2d(
                num_features=out_channels, eps=1e-5, momentum=0.999, affine=True
            )

        if leaky:
            self.relu = nn.LeakyReLU(0.1)
        elif relu:
            self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x


# class ConvBatchNormReLU(nn.Sequential):
#     def __init__(
#         self,
#         in_channels,
#         out_channels,
#         kernel_size,
#         stride,
#         padding,
#         dilation,
#         leaky=False,
#         relu=True,
#         instance=False,
#     ):
#         super(ConvBatchNormReLU, self).__init__()

#         conv = nn.Conv2d(
#             in_channels=in_channels,
#             out_channels=out_channels,
#             kernel_size=kernel_size,
#             stride=stride,
#             padding=padding,
#             dilation=dilation,
#             bias=False,
#         )
#         nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity="leaky_relu" if leaky else "relu")

#         self.add_module("conv", conv)

#         if instance:
#             self.add_module(
#                 "bn",
#                 nn.InstanceNorm2d(num_features=out_channels),
#             )
#         else:
#             self.add_module(
#                 "bn",
#                 nn.BatchNorm2d(
#                     num_features=out_channels, eps=1e-5, momentum=0.999, affine=True
#                 ),
#             )

#         if leaky:
#             self.add_module("relu", nn.LeakyReLU(0.1))
#         elif relu:
#             self.add_module("relu", nn.ReLU())

#     def forward(self, x):
#         return super(ConvBatchNormReLU, self).forward(x)


def concat_coord(x):
    ins_feat = x  # [bt, c, h, w] [512, 26, 26]
    batch_size, c, h, w = x.size()

    float_h = float(h)
    float_w = float(w)

    # normalized coordinate grids in [-1, 1]
    y_range = torch.arange(0., float_h, dtype=torch.float32)
    y_range = 2.0 * y_range / (float_h - 1.0) - 1.0
    x_range = torch.arange(0., float_w, dtype=torch.float32)
    x_range = 2.0 * x_range / (float_w - 1.0) - 1.0
    x_range = x_range[None, :]
    y_range = y_range[:, None]
    x = x_range.repeat(h, 1)
    y = y_range.repeat(1, w)

    x = x[None, None, :, :]
    y = y[None, None, :, :]
    x = x.repeat(batch_size, 1, 1, 1)
    y = y.repeat(batch_size, 1, 1, 1)
    x = x.cuda()
    y = y.cuda()

    ins_feat_out = torch.cat((ins_feat, x, x, x, y, y, y), 1)

    return ins_feat_out


class query_generator(nn.Module):
    def __init__(self, input, output, leaky=True):
        super(query_generator, self).__init__()
        self.proj1 = ConvBatchNormReLU(input + 6, input + 6, 3, 1, 1, 1, leaky=leaky)
        self.proj2 = ConvBatchNormReLU(input + 6, input + 6, 3, 1, 1, 1, leaky=leaky)
        self.proj3 = ConvBatchNormReLU(input + 6, input + 6, 3, 1, 1, 1, leaky=leaky)
        self.proj = nn.Conv2d(input + 6, output, 1, 1, 0, 1)

    def forward(self, x):
        x = concat_coord(x)
        x = x + self.proj1(x)
        x = x + self.proj2(x)
        x = x + self.proj3(x)
        x = self.proj(x)
        return x


class KLM(nn.Module):
    def __init__(self, f_dim, feat_dim):
        super(KLM, self).__init__()
        self.lang_tf_enc = lang_tf_enc(f_dim, f_dim, f_dim, head_num=8)

        self.pos_embedding = PositionEmbeddingSine(f_dim)
        encoder_layer = TransformerEncoderLayer(f_dim, nhead=8, dim_feedforward=f_dim,
                                                dropout=0.1, activation='relu', normalize_before=False)
        self.encoder = TransformerEncoder(encoder_layer, num_layers=2, norm=nn.LayerNorm(f_dim))

        # self.catproj = nn.Linear(f_dim * 2, f_dim)

        self.fc_ker = nn.Linear(f_dim, feat_dim + feat_dim)
        self.fc_vis = nn.Linear(f_dim, feat_dim + feat_dim)
        self.ker_norm = nn.LayerNorm(feat_dim)
        self.vis_norm = nn.LayerNorm(feat_dim)

        self.channel_fc = nn.Linear(feat_dim, feat_dim)
        self.channel_norm = nn.LayerNorm(feat_dim)

        self.spatial_fc = nn.Linear(feat_dim, feat_dim)
        self.spatial_norm = nn.LayerNorm(feat_dim)

        self.out_fc = nn.Linear(feat_dim, f_dim)
        self.out_norm = nn.LayerNorm(f_dim)

        self.d_model = f_dim
        self.feat_dim = feat_dim
        self.resolution_size = 26

    def forward(self, kernel, lang_feat, visu_feat):
        # kernel B x N x C
        # lang_feat B x T x C
        # visu_feat B x C x HW
        kernel = self.lang_tf_enc(kernel, lang_feat)
        # B x N x C
        bs, c, hw = visu_feat.shape
        bq, nq, cq = kernel.shape
        bl, ll, cl = lang_feat.shape

        # Image Attention
        visu_feat = visu_feat.permute(0, 2, 1)
        # B x HW x C
        pos_embed = self.pos_embedding(visu_feat)
        # B x HW x C

        visu_feat = visu_feat.transpose(0, 1)
        pos_embed = pos_embed.transpose(0, 1)
        visu_feat_ = self.encoder(visu_feat, pos=pos_embed)  # HW x B x C
        visu_feat_ = visu_feat_.transpose(0, 1)  # B x HW x C

        # repeat visual feats
        visu_feat = visu_feat_.unsqueeze(dim=1)  # B x 1 x HW x C
        kernel = kernel.unsqueeze(dim=2)  # B x N x 1 x C
        lang_feat = lang_feat.unsqueeze(dim=2)  # B x Q x 1 x C

        # split the kernel and visual projections into gating and output halves
        kernel_in = self.fc_ker(kernel)
        kernel_out = kernel_in[:, :, :, self.feat_dim:]
        kernel_in = kernel_in[:, :, :, :self.feat_dim]

        vis_in = self.fc_vis(visu_feat)
        vis_out = vis_in[:, :, :, self.feat_dim:]
        vis_in = vis_in[:, :, :, :self.feat_dim]

        gate_feat = self.ker_norm(kernel_in) * self.vis_norm(vis_in)
        # [B N HW 64]

        channel_gate = self.channel_norm(self.channel_fc(gate_feat))
        channel_gate = channel_gate.mean(2, keepdim=True)
        channel_gate = torch.sigmoid(channel_gate)
        # B x N x 1 x C

        spatial_gate = self.spatial_norm(self.spatial_fc(gate_feat))
        # spatial_gate = spatial_gate.mean(3, keepdim=True)
        spatial_gate = torch.sigmoid(spatial_gate)
        # B x N x HW x C

        channel_gate = (1 + channel_gate) * kernel_out  # B x N x 1 x C
        channel_gate = channel_gate.squeeze(2)  # B x N x C

        spatial_gate = (1 + spatial_gate) * vis_out  # B x N x HW x C
        spatial_gate = spatial_gate.mean(2)  # B x N x C

        gate_feat = (channel_gate + spatial_gate) / 2
        # [B N 64]
        gate_feat = self.out_fc(gate_feat)
        gate_feat = self.out_norm(gate_feat)
        gate_feat = F.relu(gate_feat)
        # [B N C]

        # visu_feat_.transpose(1, 2) [B C HW]
        return gate_feat, visu_feat_.transpose(1, 2)


class KAM(nn.Module):
    def __init__(self, f_dim, num_query):
        super(KAM, self).__init__()

        self.k_size = 1

        self.proj = nn.Linear(26 * 26, f_dim)

        self.fc_k = nn.Linear(f_dim, f_dim)
        self.fc_m = nn.Linear(f_dim, f_dim)
        self.fc_fus = nn.Linear(f_dim * 2, f_dim)
        self.fc_out = nn.Linear(f_dim, 1)

        self.outproj = ConvBatchNormReLU(num_query, f_dim, 3, 1, 1, 1, leaky=True)
        self.maskproj = nn.Conv2d(f_dim, 1, 3, 1, 1, 1)

        self.bn = nn.BatchNorm2d(f_dim)

        self.mask_fcs = []
        for _ in range(3):
            self.mask_fcs.append(nn.Linear(f_dim, f_dim, bias=False))
            self.mask_fcs.append(nn.LayerNorm(f_dim))
            self.mask_fcs.append(nn.ReLU())
        self.mask_fcs = nn.Sequential(*self.mask_fcs)

    def forward(self, kernel, visu_feat):
        # kernel [B N C]
        # visu_feat [B C HW]
        kernel = self.mask_fcs(kernel)

        B, N, C = kernel.shape
        kernel_ = kernel
        kernel = kernel.reshape(B, N, -1, C).permute(0, 1, 3, 2)  # B x N x C x 1
        kernel = kernel.reshape(B, N, C, self.k_size, self.k_size)  # B x N x C x 1 x 1
        # [B N C K K]
        visu_feat_ = visu_feat
        visu_feat = visu_feat.reshape(B, C, 26, 26)  # B x C x H x W

        # use each query as a dynamic convolution kernel over the visual features
        masks = []
        for i in range(B):
            masks.append(F.conv2d(visu_feat[i: i + 1], kernel[i], padding=int(self.k_size // 2)))  # 1 x N x H x W
        masks = torch.cat(masks, dim=0)  # B x N x H x W

        feats = masks.reshape(B, N, -1)  # B x N x HW
        feats = self.proj(feats)  # B x N x C

        weights_kern = F.relu(self.fc_k(kernel_))
        weights_mask = F.relu(self.fc_m(feats))

        weights = torch.cat([weights_kern, weights_mask], dim=-1)  # B x N x 2C
        weights = F.relu(self.fc_fus(weights))  # B x N x C
        weights = self.fc_out(weights)  # B x N x 1
        weights = F.softmax(weights, dim=1)  # B x N x 1

        weights = weights.unsqueeze(-1)  # B x N x 1 x 1

        mask = weights * masks  # B x N x H x W
        mask = self.outproj(mask)  # B x C x H x W
        mask = self.maskproj(mask)
        mask = torch.sigmoid(mask)  # B x 1 x H x W

        visu_feat = visu_feat * mask  # B x C x H x W

        visu_feat = self.bn(visu_feat)
        visu_feat = visu_feat.reshape(B, C, -1) + visu_feat_
        visu_feat = F.relu(visu_feat)
        return visu_feat
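Given three 26 x 26 feature maps with matching channel counts, `SFA` upsamples two of them (x2 and x4), fuses them top-down FPN-style, and pools the finest result back to the 26 x 26 grid. A minimal shape check on CPU with dummy inputs, assuming the `ASDA.model.modules` import path resolves from the repo root:

import torch
from ASDA.model.modules import SFA

sfa = SFA(in_channels=768, out_channels=512).eval()
x12 = torch.randn(2, 768, 26, 26)  # deepest features, kept at scale 1
x9 = torch.randn(2, 768, 26, 26)   # mid-level features, upsampled x2 internally
x6 = torch.randn(2, 768, 26, 26)   # shallow features, upsampled x4 internally

with torch.no_grad():
    fused = sfa([x12, x9, x6])
print(fused.shape)  # torch.Size([2, 512, 26, 26])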
ASDA/model/position_encoding.py ADDED
@@ -0,0 +1,47 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

"""
Various positional encodings for the transformer.
"""

import math
import torch
from torch import nn


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats // 2
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, f_s):
        not_mask = torch.ones_like(f_s[:, :, 0].reshape(-1, 26, 26))
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=f_s.device)
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).reshape_as(f_s)
        return pos
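Note that this variant of `PositionEmbeddingSine` is hard-wired to a 26 x 26 grid: it reshapes the token dimension to 26 x 26, builds separate sine/cosine codes for x and y, and returns them in the input's shape. A small sketch, assuming a flattened 676-token feature map:

import torch
from ASDA.model.position_encoding import PositionEmbeddingSine

pe = PositionEmbeddingSine(512)      # 256 features for y + 256 for x
f_s = torch.randn(2, 26 * 26, 512)   # [B, HW, C] flattened 26x26 feature map
pos = pe(f_s)
print(pos.shape)  # torch.Size([2, 676, 512])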
ASDA/model/transformer.py ADDED
@@ -0,0 +1,251 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DETR Transformer class.
Copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from .position_encoding import *


class lang_tf_enc(nn.Module):

    def __init__(self, input_1, input_2, hidden_dim, head_num, dropout=0.1):
        super(lang_tf_enc, self).__init__()
        self.pos_embedding_1 = PositionEmbeddingSine(input_2, normalize=True)
        self.pos_embedding_2 = PositionEmbeddingSine(input_1, normalize=True)
        self.dense_q = nn.Linear(input_1, hidden_dim)
        self.dense_k = nn.Linear(input_2, hidden_dim)
        self.dense_v = nn.Linear(input_2, hidden_dim)
        self.self_attn = nn.MultiheadAttention(hidden_dim, head_num, dropout=dropout)

        self.forward_dim = 2048
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, self.forward_dim)
        self.linear2 = nn.Linear(self.forward_dim, hidden_dim)
        self.activation = _get_activation("relu")
        self.dropout = nn.Dropout(dropout)

    # @get_local("weights")
    def forward(self, vision_input, lang_input):
        # cross-attention: visual tokens query the language tokens
        decoder_embed_lang = lang_input
        decoder_embed_vis = vision_input
        q_inp = F.relu(self.dense_q(decoder_embed_vis).permute(1, 0, 2))
        k_inp = F.relu(self.dense_k(decoder_embed_lang).permute(1, 0, 2))
        v_inp = F.relu(self.dense_v(decoder_embed_lang).permute(1, 0, 2))
        lang_input = lang_input.permute(1, 0, 2)
        decoded_layer, weights = self.self_attn(q_inp, k_inp, v_inp)

        decoded_layer = decoded_layer.permute(1, 0, 2)
        add_layer = decoded_layer + vision_input

        add_layer = self.norm1(add_layer)
        add_layer2 = self.linear2(self.dropout(self.activation(self.linear1(add_layer))))
        add_layer = add_layer + self.dropout(add_layer2)
        add_layer = self.norm2(add_layer)

        return add_layer


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation(activation):
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")


class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    # @get_local("weights")
    def forward_post(self, src, pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)
        src2, weights = self.self_attn(q, k, value=src, need_weights=False)

        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src, pos: Optional[Tensor] = None):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2, weights = self.self_attn(q, k, value=src2)
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src, pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, pos)
        return self.forward_post(src, pos)


class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2, weights = self.self_attn(q, k, value=tgt)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2, weights = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                            key=self.with_pos_embed(memory, pos),
                                            value=memory)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory, pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2, weights = self.self_attn(q, k, value=tgt2)
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2, weights = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                            key=self.with_pos_embed(memory, pos),
                                            value=memory)
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory, pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, pos, query_pos)
        return self.forward_post(tgt, memory, pos, query_pos)


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
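The encoder stack follows DETR's convention of sequence-first tensors, with the positional encoding added inside each attention call rather than once at the input. A minimal sketch wiring it to the sine embedding above, under the same 26 x 26 assumption and the same repo-root import paths:

import torch
import torch.nn as nn
from ASDA.model.position_encoding import PositionEmbeddingSine
from ASDA.model.transformer import TransformerEncoder, TransformerEncoderLayer

layer = TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=512,
                                dropout=0.1, activation='relu', normalize_before=False)
encoder = TransformerEncoder(layer, num_layers=2, norm=nn.LayerNorm(512))

feat = torch.randn(2, 26 * 26, 512)                           # [B, HW, C]
pos = PositionEmbeddingSine(512)(feat)                        # [B, HW, C]
out = encoder(feat.transpose(0, 1), pos=pos.transpose(0, 1))  # [HW, B, C]
print(out.shape)  # torch.Size([676, 2, 512])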