jiang-cc commited on
Commit
56a56de
·
verified ·
1 Parent(s): b281ecf

Upload processor

Browse files
Files changed (2) hide show
  1. modeling_yangjian.py +2 -152
  2. tokenizer_config.json +0 -4
modeling_yangjian.py CHANGED
@@ -8,11 +8,8 @@ from transformers import Qwen2_5_VLForConditionalGeneration
8
  from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
9
  Qwen2_5_VisionTransformerPretrainedModel,
10
  Qwen2_5_VLModel,
11
- Qwen2_5_VLModelOutputWithPast,
12
- is_torchdynamo_compiling,
13
  Qwen2RMSNorm,
14
  Qwen2_5_VLMLP,
15
- eager_attention_forward,
16
  ALL_ATTENTION_FUNCTIONS
17
  )
18
  from transformers.image_utils import ImageInput
@@ -618,155 +615,8 @@ class YangJianVLModel(Qwen2_5_VLModel):
618
  # image_embeds = torch.cat(enhanced_image_embeds, dim=0)
619
  return enhanced_image_embeds
620
 
621
-
622
- def forward(
623
- self,
624
- input_ids: torch.LongTensor = None,
625
- attention_mask: Optional[torch.Tensor] = None,
626
- position_ids: Optional[torch.LongTensor] = None,
627
- past_key_values: Optional[list[torch.FloatTensor]] = None,
628
- inputs_embeds: Optional[torch.FloatTensor] = None,
629
- use_cache: Optional[bool] = None,
630
- output_attentions: Optional[bool] = None,
631
- output_hidden_states: Optional[bool] = None,
632
- return_dict: Optional[bool] = None,
633
- pixel_values: Optional[torch.Tensor] = None,
634
- pixel_values_videos: Optional[torch.FloatTensor] = None,
635
- image_grid_thw: Optional[torch.LongTensor] = None,
636
- video_grid_thw: Optional[torch.LongTensor] = None,
637
- rope_deltas: Optional[torch.LongTensor] = None,
638
- cache_position: Optional[torch.LongTensor] = None,
639
- second_per_grid_ts: Optional[torch.Tensor] = None,
640
- **kwargs,
641
- ) -> Union[tuple, Qwen2_5_VLModelOutputWithPast]:
642
- r"""
643
- pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
644
- The tensors corresponding to the input videos. Pixel values can be obtained using
645
- [`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
646
- [`Qwen2VLImageProcessor`] for processing videos.
647
- image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
648
- The temporal, height and width of feature shape of each image in LLM.
649
- video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
650
- The temporal, height and width of feature shape of each video in LLM.
651
- rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
652
- The rope index difference between sequence length and multimodal rope.
653
- second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
654
- The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
655
- """
656
-
657
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
658
- output_hidden_states = (
659
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
660
- )
661
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
662
-
663
- if inputs_embeds is None:
664
-
665
- inputs_embeds = self.get_input_embeddings()(input_ids)
666
- if pixel_values is not None:
667
- image_embeds = self.get_image_features(pixel_values, image_grid_thw)
668
-
669
- image_embeds = torch.cat(image_embeds, dim=0)
670
- n_image_tokens = (input_ids == self.config.image_token_id).sum()
671
- n_image_features = image_embeds.shape[0]
672
- if not is_torchdynamo_compiling() and n_image_tokens != n_image_features:
673
- raise ValueError(
674
- f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
675
- )
676
-
677
- mask = input_ids == self.config.image_token_id
678
- mask_unsqueezed = mask.unsqueeze(-1)
679
- mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
680
- image_mask = mask_expanded.to(inputs_embeds.device)
681
-
682
- image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
683
- inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
684
-
685
- if pixel_values_videos is not None:
686
- video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
687
- video_embeds = torch.cat(video_embeds, dim=0)
688
- n_video_tokens = (input_ids == self.config.video_token_id).sum()
689
- n_video_features = video_embeds.shape[0]
690
- if not is_torchdynamo_compiling() and n_video_tokens != n_video_features:
691
- raise ValueError(
692
- f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
693
- )
694
-
695
- mask = input_ids == self.config.video_token_id
696
- mask_unsqueezed = mask.unsqueeze(-1)
697
- mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
698
- video_mask = mask_expanded.to(inputs_embeds.device)
699
-
700
- video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
701
- inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
702
-
703
- if position_ids is None:
704
- attention_mask_tensor = (
705
- attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
706
- )
707
- if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
708
- attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
709
- attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
710
- attention_mask_tensor = (1.0 - attention_mask_tensor).int()
711
-
712
- # Calculate RoPE index once per generation in the pre-fill stage only.
713
- # When compiling, we can't check tensor values thus we check only input length
714
- # It is safe to assume that `length!=1` means we're in pre-fill because compiled
715
- models currently cannot do assisted decoding
716
- prefill_compiled_stage = is_torchdynamo_compiling() and (
717
- (input_ids is not None and input_ids.shape[1] != 1)
718
- or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
719
- )
720
- prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
721
- (cache_position is not None and cache_position[0] == 0)
722
- or (past_key_values is None or past_key_values.get_seq_length() == 0)
723
- )
724
- if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
725
- position_ids, rope_deltas = self.get_rope_index_with_compare_token(
726
- input_ids,
727
- image_grid_thw,
728
- video_grid_thw,
729
- second_per_grid_ts=second_per_grid_ts,
730
- attention_mask=attention_mask_tensor,
731
- )
732
- self.rope_deltas = rope_deltas
733
- # then use the prev pre-calculated rope-deltas to get the correct position ids
734
- else:
735
- batch_size, seq_length, _ = inputs_embeds.shape
736
- delta = (
737
- (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
738
- if cache_position is not None
739
- else 0
740
- )
741
- position_ids = torch.arange(seq_length, device=inputs_embeds.device)
742
- position_ids = position_ids.view(1, -1).expand(batch_size, -1)
743
- if cache_position is not None: # otherwise `deltas` is an int `0`
744
- delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
745
- position_ids = position_ids.add(delta)
746
- position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
747
-
748
- outputs = self.language_model(
749
- input_ids=None,
750
- position_ids=position_ids,
751
- attention_mask=attention_mask,
752
- past_key_values=past_key_values,
753
- inputs_embeds=inputs_embeds,
754
- use_cache=use_cache,
755
- output_attentions=output_attentions,
756
- output_hidden_states=output_hidden_states,
757
- return_dict=True,
758
- cache_position=cache_position,
759
- **kwargs,
760
- )
761
-
762
- output = Qwen2_5_VLModelOutputWithPast(
763
- last_hidden_state=outputs.last_hidden_state,
764
- past_key_values=outputs.past_key_values,
765
- hidden_states=outputs.hidden_states,
766
- attentions=outputs.attentions,
767
- rope_deltas=self.rope_deltas,
768
- )
769
- return output if return_dict else output.to_tuple()
770
 
771
  def get_rope_index_with_compare_token(
772
  self,
 
8
  from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
9
  Qwen2_5_VisionTransformerPretrainedModel,
10
  Qwen2_5_VLModel,
 
 
11
  Qwen2RMSNorm,
12
  Qwen2_5_VLMLP,
 
13
  ALL_ATTENTION_FUNCTIONS
14
  )
15
  from transformers.image_utils import ImageInput
 
615
  # image_embeds = torch.cat(enhanced_image_embeds, dim=0)
616
  return enhanced_image_embeds
617
 
618
+ def get_rope_index(self, input_ids: Optional[torch.LongTensor] = None, image_grid_thw: Optional[torch.LongTensor] = None, video_grid_thw: Optional[torch.LongTensor] = None, second_per_grid_ts: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None) -> tuple[torch.Tensor, torch.Tensor]:
619
+ return self.get_rope_index_with_compare_token(input_ids, image_grid_thw, video_grid_thw, second_per_grid_ts, attention_mask)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
  def get_rope_index_with_compare_token(
622
  self,
tokenizer_config.json CHANGED
@@ -202,12 +202,8 @@
202
  "eos_token": "<|im_end|>",
203
  "errors": "replace",
204
  "extra_special_tokens": {},
205
- "max_length": null,
206
  "model_max_length": 131072,
207
- "pad_to_multiple_of": null,
208
  "pad_token": "<|endoftext|>",
209
- "pad_token_type_id": 0,
210
- "padding_side": "right",
211
  "processor_class": "YangJianProcessor",
212
  "split_special_tokens": false,
213
  "tokenizer_class": "Qwen2Tokenizer",
 
202
  "eos_token": "<|im_end|>",
203
  "errors": "replace",
204
  "extra_special_tokens": {},
 
205
  "model_max_length": 131072,
 
206
  "pad_token": "<|endoftext|>",
 
 
207
  "processor_class": "YangJianProcessor",
208
  "split_special_tokens": false,
209
  "tokenizer_class": "Qwen2Tokenizer",