Upload processor
Browse files- modeling_yangjian.py +187 -12
modeling_yangjian.py
CHANGED
|
@@ -116,6 +116,7 @@ class YangJianProcessor(Qwen2_5_VLProcessor):
|
|
| 116 |
for i in range(len(text)):
|
| 117 |
while self.image_token in text[i]:
|
| 118 |
num_image_tokens = image_grid_thw[index].prod() // merge_length
|
|
|
|
| 119 |
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * (num_image_tokens + self.compare_token_size), 1)
|
| 120 |
index += 1
|
| 121 |
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
|
@@ -486,7 +487,7 @@ class YangJianVisionTransformerPretrainedModel(Qwen2_5_VisionTransformerPretrain
|
|
| 486 |
def __init__(self, config, *inputs, **kwargs) -> None:
|
| 487 |
super().__init__(config, *inputs, **kwargs)
|
| 488 |
self.compare_visual_encoder = YangJianCompareVisualEncoder(config)
|
| 489 |
-
|
| 490 |
def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
|
| 491 |
"""
|
| 492 |
Args:
|
|
@@ -570,6 +571,7 @@ class YangJianVLModel(Qwen2_5_VLModel):
|
|
| 570 |
def __init__(self, config):
|
| 571 |
super().__init__(config)
|
| 572 |
self.visual = YangJianVisionTransformerPretrainedModel._from_config(config.vision_config)
|
|
|
|
| 573 |
# self.learnable_image_embeddings = nn.Parameter(
|
| 574 |
# torch.randn(100, config.hidden_size) * 0.02 # 使用小的初始化值
|
| 575 |
# )
|
|
@@ -644,19 +646,11 @@ class YangJianVLModel(Qwen2_5_VLModel):
|
|
| 644 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 645 |
|
| 646 |
if inputs_embeds is None:
|
|
|
|
| 647 |
inputs_embeds = self.get_input_embeddings()(input_ids)
|
| 648 |
if pixel_values is not None:
|
| 649 |
image_embeds = self.get_image_features(pixel_values, image_grid_thw)
|
| 650 |
-
|
| 651 |
-
# # 为每个图像添加 100 个可学习的 embedding
|
| 652 |
-
# learnable_embeddings = self.learnable_image_embeddings.to(image_embeds[0].device, image_embeds[0].dtype)
|
| 653 |
-
# enhanced_image_embeds = []
|
| 654 |
-
|
| 655 |
-
# for i, embeds in enumerate(image_embeds):
|
| 656 |
-
# # 为每个图像添加 100 个可学习的 embedding
|
| 657 |
-
# enhanced_embeds = torch.cat([embeds, learnable_embeddings], dim=0)
|
| 658 |
-
# enhanced_image_embeds.append(enhanced_embeds)
|
| 659 |
-
|
| 660 |
image_embeds = torch.cat(image_embeds, dim=0)
|
| 661 |
n_image_tokens = (input_ids == self.config.image_token_id).sum()
|
| 662 |
n_image_features = image_embeds.shape[0]
|
|
@@ -713,7 +707,7 @@ class YangJianVLModel(Qwen2_5_VLModel):
|
|
| 713 |
or (past_key_values is None or past_key_values.get_seq_length() == 0)
|
| 714 |
)
|
| 715 |
if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
|
| 716 |
-
position_ids, rope_deltas = self.
|
| 717 |
input_ids,
|
| 718 |
image_grid_thw,
|
| 719 |
video_grid_thw,
|
|
@@ -758,6 +752,142 @@ class YangJianVLModel(Qwen2_5_VLModel):
|
|
| 758 |
rope_deltas=self.rope_deltas,
|
| 759 |
)
|
| 760 |
return output if return_dict else output.to_tuple()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
|
| 762 |
class YangJianVLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
| 763 |
config_class = YangJianConfig
|
|
@@ -765,3 +895,48 @@ class YangJianVLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
|
|
| 765 |
def __init__(self, config):
|
| 766 |
super().__init__(config)
|
| 767 |
self.model = YangJianVLModel(config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
for i in range(len(text)):
|
| 117 |
while self.image_token in text[i]:
|
| 118 |
num_image_tokens = image_grid_thw[index].prod() // merge_length
|
| 119 |
+
# text[i] = text[i].replace(self.image_token, "<|placeholder|>" * (num_image_tokens), 1)
|
| 120 |
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * (num_image_tokens + self.compare_token_size), 1)
|
| 121 |
index += 1
|
| 122 |
text[i] = text[i].replace("<|placeholder|>", self.image_token)
|
|
|
|
| 487 |
def __init__(self, config, *inputs, **kwargs) -> None:
    """Initialize the Qwen2.5-VL vision transformer plus a compare-token encoder.

    Args:
        config: Vision config forwarded to the Qwen2.5-VL vision transformer base.
        *inputs, **kwargs: Passed through unchanged to the base constructor.
    """
    super().__init__(config, *inputs, **kwargs)
    # Extra visual encoder; presumably produces the features for the
    # `compare_token_size` compare tokens the processor reserves per image
    # — TODO confirm against YangJianCompareVisualEncoder's forward.
    self.compare_visual_encoder = YangJianCompareVisualEncoder(config)
|
| 490 |
+
|
| 491 |
def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
|
| 492 |
"""
|
| 493 |
Args:
|
|
|
|
| 571 |
def __init__(self, config):
|
| 572 |
super().__init__(config)
|
| 573 |
self.visual = YangJianVisionTransformerPretrainedModel._from_config(config.vision_config)
|
| 574 |
+
self.compare_token_size = config.vision_config.compare_token_size
|
| 575 |
# self.learnable_image_embeddings = nn.Parameter(
|
| 576 |
# torch.randn(100, config.hidden_size) * 0.02 # 使用小的初始化值
|
| 577 |
# )
|
|
|
|
| 646 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 647 |
|
| 648 |
if inputs_embeds is None:
|
| 649 |
+
|
| 650 |
inputs_embeds = self.get_input_embeddings()(input_ids)
|
| 651 |
if pixel_values is not None:
|
| 652 |
image_embeds = self.get_image_features(pixel_values, image_grid_thw)
|
| 653 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
image_embeds = torch.cat(image_embeds, dim=0)
|
| 655 |
n_image_tokens = (input_ids == self.config.image_token_id).sum()
|
| 656 |
n_image_features = image_embeds.shape[0]
|
|
|
|
| 707 |
or (past_key_values is None or past_key_values.get_seq_length() == 0)
|
| 708 |
)
|
| 709 |
if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
|
| 710 |
+
position_ids, rope_deltas = self.get_rope_index_with_compare_token(
|
| 711 |
input_ids,
|
| 712 |
image_grid_thw,
|
| 713 |
video_grid_thw,
|
|
|
|
| 752 |
rope_deltas=self.rope_deltas,
|
| 753 |
)
|
| 754 |
return output if return_dict else output.to_tuple()
|
| 755 |
+
|
| 756 |
+
def get_rope_index_with_compare_token(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute 3D M-RoPE position ids, accounting for extra per-image compare tokens.

    Variant of Qwen2.5-VL's `get_rope_index`: after each image's (t, h, w) grid
    positions it appends positions for `self.compare_token_size` additional
    "compare" tokens, matching the extra placeholder tokens the processor
    inserts per image.

    Args:
        input_ids: (batch, seq_len) token ids.
        image_grid_thw: (num_images, 3) temporal/height/width patch grid per image.
        video_grid_thw: (num_videos, 3) patch grid per video.
        second_per_grid_ts: Per-video seconds-per-temporal-grid; defaults to 1.0.
        attention_mask: (batch, seq_len) mask; defaults to all ones.

    Returns:
        position_ids: (3, batch, seq_len) temporal/height/width position ids.
        mrope_position_deltas: (batch, 1) offset of max position vs. sequence length,
            used to continue positions during decoding.
    """
    spatial_merge_size = self.config.vision_config.spatial_merge_size
    image_token_id = self.config.image_token_id
    video_token_id = self.config.video_token_id
    vision_start_token_id = self.config.vision_start_token_id
    mrope_position_deltas = []
    if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
        total_input_ids = input_ids
        if attention_mask is None:
            attention_mask = torch.ones_like(total_input_ids)
        position_ids = torch.ones(
            3,
            input_ids.shape[0],
            input_ids.shape[1],
            dtype=input_ids.dtype,
            device=input_ids.device,
        )
        image_index, video_index = 0, 0
        attention_mask = attention_mask.to(total_input_ids.device)
        for i, input_ids in enumerate(total_input_ids):
            # Work only on the unpadded portion of this sequence.
            input_ids = input_ids[attention_mask[i] == 1]
            image_nums, video_nums = 0, 0
            # Each vision segment is announced by a vision-start token; the token
            # right after it tells whether the segment is an image or a video.
            vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
            vision_tokens = input_ids[vision_start_indices + 1]
            image_nums = (vision_tokens == image_token_id).sum()
            video_nums = (vision_tokens == video_token_id).sum()
            input_tokens = input_ids.tolist()
            llm_pos_ids_list: list = []
            st = 0
            remain_images, remain_videos = image_nums, video_nums
            for vision_index in range(image_nums + video_nums):
                # Find the nearest upcoming image/video token; len+1 sentinels
                # make the min-comparison below pick the other modality.
                if image_token_id in input_tokens and remain_images > 0:
                    ed_image = input_tokens.index(image_token_id, st)
                else:
                    ed_image = len(input_tokens) + 1
                if video_token_id in input_tokens and remain_videos > 0:
                    ed_video = input_tokens.index(video_token_id, st)
                else:
                    ed_video = len(input_tokens) + 1
                if ed_image < ed_video:
                    # Image segment: no temporal progression (second_per_grid_t = 0).
                    t, h, w = (
                        image_grid_thw[image_index][0],
                        image_grid_thw[image_index][1],
                        image_grid_thw[image_index][2],
                    )
                    second_per_grid_t = 0
                    image_index += 1
                    remain_images -= 1
                    ed = ed_image

                else:
                    # Video segment: temporal index advances per frame grid.
                    t, h, w = (
                        video_grid_thw[video_index][0],
                        video_grid_thw[video_index][1],
                        video_grid_thw[video_index][2],
                    )
                    if second_per_grid_ts is not None:
                        second_per_grid_t = second_per_grid_ts[video_index]
                    else:
                        second_per_grid_t = 1.0
                    video_index += 1
                    remain_videos -= 1
                    ed = ed_video
                # Spatial dims are merged by spatial_merge_size before reaching the LLM.
                llm_grid_t, llm_grid_h, llm_grid_w = (
                    t.item(),
                    h.item() // spatial_merge_size,
                    w.item() // spatial_merge_size,
                )
                text_len = ed - st

                # Plain text before this vision segment: identical t/h/w positions.
                st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                range_tensor = torch.arange(llm_grid_t).view(-1, 1)
                expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)

                ## normalize type, send to device.
                second_per_grid_t = torch.as_tensor(
                    second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
                )

                # Temporal position scaled to absolute time units.
                time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second

                time_tensor_long = time_tensor.long()
                t_index = time_tensor_long.flatten()

                h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
                w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
                llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
                st = ed + llm_grid_t * llm_grid_h * llm_grid_w
                if ed_image < ed_video:
                    # Image segment: also emit positions for the compare_token_size
                    # extra image-comparison tokens inserted after the image.
                    # They reuse the last temporal index; h/w indices restart at 0
                    # and run diagonally (arange for both) —
                    # NOTE(review): these h/w positions can coincide with the
                    # image grid's own (h, w) positions; confirm this overlap is
                    # intentional for the compare tokens.
                    compare_t_index = t_index[-1].repeat(self.compare_token_size)
                    compare_h_index = torch.arange(self.compare_token_size)
                    compare_w_index = torch.arange(self.compare_token_size)
                    llm_pos_ids_list.append(torch.stack([compare_t_index, compare_h_index, compare_w_index]) + text_len + st_idx)
                    st = st + self.compare_token_size

            # Trailing text after the last vision segment.
            if st < len(input_tokens):
                st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                text_len = len(input_tokens) - st
                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

            llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
            # Scatter computed positions back into the unmasked slots of row i.
            position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
            mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
        mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
        return position_ids, mrope_position_deltas
    else:
        # Text-only fallback: standard 1D positions replicated across the 3 M-RoPE axes.
        if attention_mask is not None:
            position_ids = attention_mask.long().cumsum(-1) - 1
            # Padded slots get a harmless constant position (1).
            position_ids.masked_fill_(attention_mask == 0, 1)
            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
            max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
            mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
        else:
            position_ids = (
                torch.arange(input_ids.shape[1], device=input_ids.device)
                .view(1, 1, -1)
                .expand(3, input_ids.shape[0], -1)
            )
            mrope_position_deltas = torch.zeros(
                [input_ids.shape[0], 1],
                device=input_ids.device,
                dtype=input_ids.dtype,
            )

        return position_ids, mrope_position_deltas
|
| 891 |
|
| 892 |
class YangJianVLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
    """Qwen2.5-VL conditional-generation head backed by the YangJian VL model.

    Only the backbone is swapped: ``self.model`` becomes :class:`YangJianVLModel`,
    whose vision tower and RoPE indexing account for the extra per-image
    compare tokens the processor inserts. Everything else (generation loop,
    LM head, loss) is inherited unchanged from the Qwen2.5-VL base class.
    """

    config_class = YangJianConfig

    def __init__(self, config):
        """Build the base generation model, then replace its backbone.

        Args:
            config: A :class:`YangJianConfig` (Qwen2.5-VL-compatible config with
                ``vision_config.compare_token_size``).
        """
        super().__init__(config)
        # Swap the stock Qwen2.5-VL backbone for the compare-token-aware variant.
        self.model = YangJianVLModel(config)