sy1998 committed
Commit f17c66b · verified · 1 Parent(s): 246b86d

Update weights

__pycache__/configuration_earthmind_chat.cpython-310.pyc CHANGED
Binary files a/__pycache__/configuration_earthmind_chat.cpython-310.pyc and b/__pycache__/configuration_earthmind_chat.cpython-310.pyc differ
 
__pycache__/configuration_intern_vit.cpython-310.pyc CHANGED
Binary files a/__pycache__/configuration_intern_vit.cpython-310.pyc and b/__pycache__/configuration_intern_vit.cpython-310.pyc differ
 
__pycache__/configuration_internlm2.cpython-310.pyc CHANGED
Binary files a/__pycache__/configuration_internlm2.cpython-310.pyc and b/__pycache__/configuration_internlm2.cpython-310.pyc differ
 
__pycache__/configuration_phi3.cpython-310.pyc CHANGED
Binary files a/__pycache__/configuration_phi3.cpython-310.pyc and b/__pycache__/configuration_phi3.cpython-310.pyc differ
 
__pycache__/flash_attention.cpython-310.pyc CHANGED
Binary files a/__pycache__/flash_attention.cpython-310.pyc and b/__pycache__/flash_attention.cpython-310.pyc differ
 
__pycache__/modeling_earthmind_chat.cpython-310.pyc CHANGED
Binary files a/__pycache__/modeling_earthmind_chat.cpython-310.pyc and b/__pycache__/modeling_earthmind_chat.cpython-310.pyc differ
 
__pycache__/modeling_intern_vit.cpython-310.pyc CHANGED
Binary files a/__pycache__/modeling_intern_vit.cpython-310.pyc and b/__pycache__/modeling_intern_vit.cpython-310.pyc differ
 
__pycache__/modeling_internlm2.cpython-310.pyc CHANGED
Binary files a/__pycache__/modeling_internlm2.cpython-310.pyc and b/__pycache__/modeling_internlm2.cpython-310.pyc differ
 
__pycache__/modeling_phi3.cpython-310.pyc CHANGED
Binary files a/__pycache__/modeling_phi3.cpython-310.pyc and b/__pycache__/modeling_phi3.cpython-310.pyc differ
 
__pycache__/sam2.cpython-310.pyc CHANGED
Binary files a/__pycache__/sam2.cpython-310.pyc and b/__pycache__/sam2.cpython-310.pyc differ
 
__pycache__/templates.cpython-310.pyc CHANGED
Binary files a/__pycache__/templates.cpython-310.pyc and b/__pycache__/templates.cpython-310.pyc differ
 
config.json CHANGED
@@ -102,7 +102,7 @@
   "select_layer": -1,
   "template": "phi3_chat",
   "tie_word_embeddings": false,
-  "torch_dtype": "float32",
+  "torch_dtype": "bfloat16",
   "transformers_version": null,
   "use_backbone_lora": 0,
   "use_llm_lora": 0,
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:39e63f528727404f1bcdbf73e14d96ed8d27e98ed19c3fe9bbced85c0604130a
-size 4971490432
+oid sha256:ffb2ee69f62e4ee01ef85baa7899ef2a2058a1ddfa8f7d56ca015b7a57ae57cc
+size 4971473960
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2bfb324f82fc39c5726321b3415ed4be3e942967f0cfbb50729cb5948ac78a6c
+oid sha256:da0766717029b20f96662a83365ebbca20b8f71b3a4886f21851d58620c023a6
 size 4932952216
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:42f354c28221ee31e11f68d6981a3ad3a0d8d0264b2978957d01746233a30b29
+oid sha256:ffc1300216ae100b755f554627400b6e039bdcf06d889b9c5f7fe198445dc2ca
 size 4995688160
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9ab80a04199bb5a17e5696a716037554e7d2a8ffc230e3863cd86dd29b587acc
+oid sha256:c94104c4b476c29e652268cbe580828af39841c553d443c3984c0d4619b572ce
 size 259328744
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 15159230664
+    "total_size": 15159214280
   },
   "weight_map": {
     "grounding_encoder.sam2_model.image_encoder.neck.convs.0.conv.bias": "model-00004-of-00004.safetensors",
@@ -1338,7 +1338,6 @@
     "language_model.model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
     "language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
     "language_model.model.norm.weight": "model-00003-of-00004.safetensors",
-    "local_query": "model-00001-of-00004.safetensors",
     "mlp1.0.bias": "model-00003-of-00004.safetensors",
     "mlp1.0.weight": "model-00003-of-00004.safetensors",
     "mlp1.1.bias": "model-00003-of-00004.safetensors",
modeling_earthmind_chat.py CHANGED
@@ -3,7 +3,7 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-
+from math import sqrt
 import warnings
 from typing import Any, List, Optional, Tuple, Union
 
@@ -113,7 +113,9 @@ class Sa2VAChatModel(PreTrainedModel):
         self.ps_version = config.ps_version
         self.llm_arch_name = config.llm_config.architectures[0]
 
-        self.local_query = nn.Parameter(torch.randn(2, 2048))
+
+
+        self.hca_tau = 1.0
 
         use_flash_attn = use_flash_attn if has_flash_attn else False
         config.vision_config.use_flash_attn = True if use_flash_attn else False
@@ -334,7 +336,6 @@
 
         B, N, C = input_embeds.shape
         input_embeds = input_embeds.reshape(B * N, C)
-
 
         self._count += 1
 
@@ -550,6 +551,7 @@
     ) -> torch.LongTensor:
         device = self.device
         assert self.img_context_token_id is not None
+        input_embeds = self.language_model.get_input_embeddings()(input_ids.to(device))
 
         if pixel_values is not None:
             if visual_features is not None:
@@ -572,14 +574,60 @@
 
         vit_embeds = self.extract_feature(pixel_values.to(device))
         rgb_vit_embeds = self.extract_feature(rgb_pixel_values.to(device))
-        vit_embeds = torch.cat([vit_embeds, rgb_vit_embeds], dim=1)  # 10, 512, 2048
+
+
+        print("extract_featrues", rgb_vit_embeds.shape, vit_embeds.shape)
+        if rgb_vit_embeds.shape[0] != vit_embeds.shape[0]:
+            # average-pool over the batch dimension, keeping the last two dimensions unchanged
+            rgb_vit_embeds = rgb_vit_embeds.mean(dim=0, keepdim=True)
+            print("after avgpooling:", rgb_vit_embeds.shape)
+        X_sar = vit_embeds
+        X_rgb = rgb_vit_embeds
+
+
+        tau = 0.2  # temperature, spreads out the distribution
+        tau_txt = 0.2
+        D = X_rgb.size(-1)
+
+        # cross-modal attention
+        A_rs = torch.matmul(X_rgb, X_sar.transpose(-2, -1)) / (sqrt(D) * tau)  # (B, N, N)
+        A_rs = F.softmax(A_rs, dim=-1)  # RGB -> SAR
+
+        A_sr = torch.matmul(X_sar, X_rgb.transpose(-2, -1)) / (sqrt(D) * tau)  # (B, N, N)
+        A_sr = F.softmax(A_sr, dim=-1)  # SAR -> RGB
+
+        # diagonal = mutual affinity between corresponding positions
+        r_sar = torch.diagonal(A_rs, dim1=-2, dim2=-1)  # (B, N) reliability of the matching SAR position, as judged by RGB
+        r_rgb = torch.diagonal(A_sr, dim1=-2, dim2=-1)  # (B, N) reliability of the matching RGB position, as judged by SAR
+
+        ###################### add visual-text cross attention
+        t = input_embeds.mean(dim=1)
+        beta_rgb = torch.matmul(X_rgb, t.unsqueeze(-1)).squeeze(-1) / tau_txt  # (B, N)
+        beta_sar = torch.matmul(X_sar, t.unsqueeze(-1)).squeeze(-1) / tau_txt  # (B, N)
+
+
+        # turn both branches into probabilities first; linear interpolation in logit space is more stable
+        eps = 1e-6
+        vis_pair = torch.stack([r_rgb, r_sar], dim=-1)  # (B, N, 2)
+        txt_pair = torch.stack([beta_rgb, beta_sar], dim=-1)  # (B, N, 2)
+
+
+        logits = 0.5 * vis_pair + 0.5 * txt_pair  # (B, N, 2)
+
+        alpha = F.softmax(logits, dim=-1)  # (B, N, 2)
+        alpha_rgb = alpha[..., 0].unsqueeze(-1)  # (B, N, 1)
+        alpha_sar = alpha[..., 1].unsqueeze(-1)  # (B, N, 1)
+
+        # position-wise weighted fusion
+        Z = alpha_rgb * X_rgb + alpha_sar * X_sar  # (B, N, D)
+        vit_embeds = Z
 
         image_flags = torch.sum(pixel_values, dim=(1, 2, 3)) != 0
         image_flags = image_flags.long()
         vit_embeds = vit_embeds[image_flags == 1]
 
 
-        input_embeds = self.language_model.get_input_embeddings()(input_ids.to(device))
+
         B, N, C = input_embeds.shape
         input_embeds = input_embeds.reshape(B * N, C)
 
@@ -648,7 +696,7 @@
 
         # print("generate", encode_outputs.hidden_states[-1][0].shape)
         encode_feature = encode_outputs.hidden_states[-1][0]
-        return outputs, encode_feature, encode_outputs.attentions
+        return outputs, encode_feature, encode_outputs.attentions, (alpha_rgb, alpha_sar)
 
     def preparing_for_generation(self, tokenizer, max_new_tokens=2048, torch_dtype=torch.bfloat16):
         # set stop criteria and generation configs for model
@@ -784,7 +832,7 @@
                 self.grounding_encoder.preprocess_image(pixel) for pixel in extra_pixel_values
             ]).to(self.torch_dtype)
 
-            images = dynamic_preprocess(image, self.min_dynamic_patch,
+            images, weight = dynamic_preprocess(image, self.min_dynamic_patch,
                                         self.max_dynamic_patch,
                                         self.image_size, self.use_thumbnail)
 
@@ -899,11 +947,6 @@
 
 
 
-
-
-
-
-
     def predict_forward_multi(
             self,
             image=None,
@@ -966,7 +1009,7 @@
             input_dict['vp_overall_mask'] = None
         else:
             ori_image_size = image.size
-
+
             # prepare grounding images
             g_image = np.array(image)  # for grounding
             g_image = self.extra_image_processor.apply_image(g_image)
@@ -976,14 +1019,11 @@
                 self.grounding_encoder.preprocess_image(pixel) for pixel in extra_pixel_values
            ]).to(self.torch_dtype)
 
-            images = dynamic_preprocess(image, self.min_dynamic_patch,
+            images, sta = dynamic_preprocess(image, self.min_dynamic_patch,
                                         self.max_dynamic_patch,
                                         self.image_size, self.use_thumbnail)
 
-
-            rgb_images = dynamic_preprocess(rgb_image, self.min_dynamic_patch,
-                                            self.max_dynamic_patch,
-                                            self.image_size, self.use_thumbnail)
+
 
 
 
@@ -996,11 +1036,34 @@
             pixel_values = [self.transformer(image) for image in images]
             pixel_values = torch.stack(pixel_values).to(self.torch_dtype)
 
-
-            rgb_pixel_values = [self.transformer(image) for image in rgb_images]
-            rgb_pixel_values = torch.stack(rgb_pixel_values).to(self.torch_dtype)
+            if type(rgb_image) is list:
+                pixel_values_list = []
+                for img_rgb_like in rgb_image:
+                    sub_images, _ = dynamic_preprocess(
+                        img_rgb_like,
+                        self.min_dynamic_patch,
+                        self.max_dynamic_patch,
+                        self.image_size,
+                        self.use_thumbnail
+                    )
+                    pixel_values_list.extend([self.transformer(si) for si in sub_images])
+
+                rgb_pixel_values = torch.stack(pixel_values_list).to(self.torch_dtype)  # shape: [M_total, 3, 44
+
+            else:
+                rgb_images, sta = dynamic_preprocess(rgb_image, self.min_dynamic_patch,
+                                                     self.max_dynamic_patch,
+                                                     self.image_size, self.use_thumbnail)
+
+
+
+
+                rgb_pixel_values = [self.transformer(image) for image in rgb_images]
+                rgb_pixel_values = torch.stack(rgb_pixel_values).to(self.torch_dtype)
+
+            print("input", rgb_pixel_values.shape, pixel_values.shape)
 
-            num_image_tokens = pixel_values.shape[0] * self.patch_token * 2
+            num_image_tokens = pixel_values.shape[0] * self.patch_token
             num_frames = 1
             input_dict['g_pixel_values'] = g_pixel_values
             input_dict['pixel_values'] = pixel_values
@@ -1067,7 +1130,7 @@
             'vp_overall_mask': input_dict['vp_overall_mask'],
         }
 
-        generate_output, encode_feature, encode_attention = self.generate_multi(
+        generate_output, encode_feature, encode_attention, vis_weight = self.generate_multi(
             **mm_inputs,
             generation_config=self.gen_config,
             streamer=None,
@@ -1109,7 +1172,7 @@
             masks = masks.cpu().numpy()
             ret_masks.append(masks)
 
-        return {'prediction': predict, 'prediction_masks': ret_masks,}
+        return {'prediction': predict, 'prediction_masks': ret_masks, 'vis_weight': vis_weight, "sta": sta}
 
 def get_seg_hidden_states(hidden_states, output_ids, seg_id):
     seg_mask = output_ids == seg_id
@@ -1159,6 +1222,11 @@ def dynamic_preprocess(image,
     target_height = image_size * target_aspect_ratio[1]
     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
 
+    cols = target_aspect_ratio[0]
+    rows = target_aspect_ratio[1]
+
+
+
     # resize the image
     resized_img = image.resize((target_width, target_height))
     processed_images = []
@@ -1174,7 +1242,7 @@
     if use_thumbnail and len(processed_images) != 1:
         thumbnail_img = image.resize((image_size, image_size))
         processed_images.append(thumbnail_img)
-    return processed_images
+    return processed_images, (cols, rows)
 
 
 from transformers.cache_utils import Cache, DynamicCache
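
For reference, the RGB/SAR fusion this commit adds can be read as one self-contained function. This is an illustrative sketch, not part of the committed file: the helper name fuse_rgb_sar is hypothetical, while the tensor shapes, the 0.2 temperatures, and the mean-pooled text embedding follow the diff above.

from math import sqrt
import torch
import torch.nn.functional as F

def fuse_rgb_sar(X_rgb, X_sar, text_embeds, tau=0.2, tau_txt=0.2):
    """X_rgb, X_sar: (B, N, D) visual tokens; text_embeds: (B, T, D) text token embeddings."""
    D = X_rgb.size(-1)

    # cross-modal attention in both directions
    A_rs = F.softmax(torch.matmul(X_rgb, X_sar.transpose(-2, -1)) / (sqrt(D) * tau), dim=-1)  # RGB -> SAR
    A_sr = F.softmax(torch.matmul(X_sar, X_rgb.transpose(-2, -1)) / (sqrt(D) * tau), dim=-1)  # SAR -> RGB

    # diagonal entries: how strongly each token attends to its counterpart in the other modality
    r_sar = torch.diagonal(A_rs, dim1=-2, dim2=-1)  # (B, N)
    r_rgb = torch.diagonal(A_sr, dim1=-2, dim2=-1)  # (B, N)

    # text-guided scores: similarity of each visual token to the mean text embedding
    t = text_embeds.mean(dim=1)                                            # (B, D)
    beta_rgb = torch.matmul(X_rgb, t.unsqueeze(-1)).squeeze(-1) / tau_txt  # (B, N)
    beta_sar = torch.matmul(X_sar, t.unsqueeze(-1)).squeeze(-1) / tau_txt  # (B, N)

    # average visual and textual evidence, then softmax over the two modalities per position
    logits = 0.5 * torch.stack([r_rgb, r_sar], dim=-1) + 0.5 * torch.stack([beta_rgb, beta_sar], dim=-1)
    alpha = F.softmax(logits, dim=-1)  # (B, N, 2)

    # per-position weighted fusion of the two modalities
    return alpha[..., 0:1] * X_rgb + alpha[..., 1:2] * X_sar  # (B, N, D)

# quick shape check
Z = fuse_rgb_sar(torch.randn(1, 256, 2048), torch.randn(1, 256, 2048), torch.randn(1, 32, 2048))

In the committed code the result overwrites vit_embeds before the image-token replacement, and the per-position weights (alpha_rgb, alpha_sar) are additionally returned through generate_multi and surfaced as 'vis_weight' in predict_forward_multi.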