Upload processor
Browse files
- modeling_yangjian.py +2 -152
- tokenizer_config.json +0 -4
modeling_yangjian.py
CHANGED
|
@@ -8,11 +8,8 @@ from transformers import Qwen2_5_VLForConditionalGeneration
|
|
| 8 |
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
|
| 9 |
Qwen2_5_VisionTransformerPretrainedModel,
|
| 10 |
Qwen2_5_VLModel,
|
| 11 |
-
Qwen2_5_VLModelOutputWithPast,
|
| 12 |
-
is_torchdynamo_compiling,
|
| 13 |
Qwen2RMSNorm,
|
| 14 |
Qwen2_5_VLMLP,
|
| 15 |
-
eager_attention_forward,
|
| 16 |
ALL_ATTENTION_FUNCTIONS
|
| 17 |
)
|
| 18 |
from transformers.image_utils import ImageInput
|
|
@@ -618,155 +615,8 @@ class YangJianVLModel(Qwen2_5_VLModel):
|
|
| 618 |
# image_embeds = torch.cat(enhanced_image_embeds, dim=0)
|
| 619 |
return enhanced_image_embeds
|
| 620 |
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
self,
|
| 624 |
-
input_ids: torch.LongTensor = None,
|
| 625 |
-
attention_mask: Optional[torch.Tensor] = None,
|
| 626 |
-
position_ids: Optional[torch.LongTensor] = None,
|
| 627 |
-
past_key_values: Optional[list[torch.FloatTensor]] = None,
|
| 628 |
-
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 629 |
-
use_cache: Optional[bool] = None,
|
| 630 |
-
output_attentions: Optional[bool] = None,
|
| 631 |
-
output_hidden_states: Optional[bool] = None,
|
| 632 |
-
return_dict: Optional[bool] = None,
|
| 633 |
-
pixel_values: Optional[torch.Tensor] = None,
|
| 634 |
-
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
| 635 |
-
image_grid_thw: Optional[torch.LongTensor] = None,
|
| 636 |
-
video_grid_thw: Optional[torch.LongTensor] = None,
|
| 637 |
-
rope_deltas: Optional[torch.LongTensor] = None,
|
| 638 |
-
cache_position: Optional[torch.LongTensor] = None,
|
| 639 |
-
second_per_grid_ts: Optional[torch.Tensor] = None,
|
| 640 |
-
**kwargs,
|
| 641 |
-
) -> Union[tuple, Qwen2_5_VLModelOutputWithPast]:
|
| 642 |
-
r"""
|
| 643 |
-
pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
|
| 644 |
-
The tensors corresponding to the input videos. Pixel values can be obtained using
|
| 645 |
-
[`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
|
| 646 |
-
[`Qwen2VLImageProcessor`] for processing videos.
|
| 647 |
-
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
|
| 648 |
-
The temporal, height and width of feature shape of each image in LLM.
|
| 649 |
-
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
|
| 650 |
-
The temporal, height and width of feature shape of each video in LLM.
|
| 651 |
-
rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
|
| 652 |
-
The rope index difference between sequence length and multimodal rope.
|
| 653 |
-
second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
|
| 654 |
-
The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
|
| 655 |
-
"""
|
| 656 |
-
|
| 657 |
-
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 658 |
-
output_hidden_states = (
|
| 659 |
-
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 660 |
-
)
|
| 661 |
-
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 662 |
-
|
| 663 |
-
if inputs_embeds is None:
|
| 664 |
-
|
| 665 |
-
inputs_embeds = self.get_input_embeddings()(input_ids)
|
| 666 |
-
if pixel_values is not None:
|
| 667 |
-
image_embeds = self.get_image_features(pixel_values, image_grid_thw)
|
| 668 |
-
|
| 669 |
-
image_embeds = torch.cat(image_embeds, dim=0)
|
| 670 |
-
n_image_tokens = (input_ids == self.config.image_token_id).sum()
|
| 671 |
-
n_image_features = image_embeds.shape[0]
|
| 672 |
-
if not is_torchdynamo_compiling() and n_image_tokens != n_image_features:
|
| 673 |
-
raise ValueError(
|
| 674 |
-
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
| 675 |
-
)
|
| 676 |
-
|
| 677 |
-
mask = input_ids == self.config.image_token_id
|
| 678 |
-
mask_unsqueezed = mask.unsqueeze(-1)
|
| 679 |
-
mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
|
| 680 |
-
image_mask = mask_expanded.to(inputs_embeds.device)
|
| 681 |
-
|
| 682 |
-
image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
|
| 683 |
-
inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
|
| 684 |
-
|
| 685 |
-
if pixel_values_videos is not None:
|
| 686 |
-
video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
|
| 687 |
-
video_embeds = torch.cat(video_embeds, dim=0)
|
| 688 |
-
n_video_tokens = (input_ids == self.config.video_token_id).sum()
|
| 689 |
-
n_video_features = video_embeds.shape[0]
|
| 690 |
-
if not is_torchdynamo_compiling() and n_video_tokens != n_video_features:
|
| 691 |
-
raise ValueError(
|
| 692 |
-
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
|
| 693 |
-
)
|
| 694 |
-
|
| 695 |
-
mask = input_ids == self.config.video_token_id
|
| 696 |
-
mask_unsqueezed = mask.unsqueeze(-1)
|
| 697 |
-
mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
|
| 698 |
-
video_mask = mask_expanded.to(inputs_embeds.device)
|
| 699 |
-
|
| 700 |
-
video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
|
| 701 |
-
inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
|
| 702 |
-
|
| 703 |
-
if position_ids is None:
|
| 704 |
-
attention_mask_tensor = (
|
| 705 |
-
attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
|
| 706 |
-
)
|
| 707 |
-
if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
|
| 708 |
-
attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
|
| 709 |
-
attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
|
| 710 |
-
attention_mask_tensor = (1.0 - attention_mask_tensor).int()
|
| 711 |
-
|
| 712 |
-
# Calculate RoPE index once per generation in the pre-fill stage only.
|
| 713 |
-
# When compiling, we can't check tensor values thus we check only input length
|
| 714 |
-
# It is safe to assume that `length!=1` means we're in pre-fill because compiled
|
| 715 |
-
# models currently cannot do assisted decoding
|
| 716 |
-
prefill_compiled_stage = is_torchdynamo_compiling() and (
|
| 717 |
-
(input_ids is not None and input_ids.shape[1] != 1)
|
| 718 |
-
or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
|
| 719 |
-
)
|
| 720 |
-
prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
|
| 721 |
-
(cache_position is not None and cache_position[0] == 0)
|
| 722 |
-
or (past_key_values is None or past_key_values.get_seq_length() == 0)
|
| 723 |
-
)
|
| 724 |
-
if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
|
| 725 |
-
position_ids, rope_deltas = self.get_rope_index_with_compare_token(
|
| 726 |
-
input_ids,
|
| 727 |
-
image_grid_thw,
|
| 728 |
-
video_grid_thw,
|
| 729 |
-
second_per_grid_ts=second_per_grid_ts,
|
| 730 |
-
attention_mask=attention_mask_tensor,
|
| 731 |
-
)
|
| 732 |
-
self.rope_deltas = rope_deltas
|
| 733 |
-
# then use the prev pre-calculated rope-deltas to get the correct position ids
|
| 734 |
-
else:
|
| 735 |
-
batch_size, seq_length, _ = inputs_embeds.shape
|
| 736 |
-
delta = (
|
| 737 |
-
(cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
|
| 738 |
-
if cache_position is not None
|
| 739 |
-
else 0
|
| 740 |
-
)
|
| 741 |
-
position_ids = torch.arange(seq_length, device=inputs_embeds.device)
|
| 742 |
-
position_ids = position_ids.view(1, -1).expand(batch_size, -1)
|
| 743 |
-
if cache_position is not None: # otherwise `delta` is an int `0`
|
| 744 |
-
delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
|
| 745 |
-
position_ids = position_ids.add(delta)
|
| 746 |
-
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
|
| 747 |
-
|
| 748 |
-
outputs = self.language_model(
|
| 749 |
-
input_ids=None,
|
| 750 |
-
position_ids=position_ids,
|
| 751 |
-
attention_mask=attention_mask,
|
| 752 |
-
past_key_values=past_key_values,
|
| 753 |
-
inputs_embeds=inputs_embeds,
|
| 754 |
-
use_cache=use_cache,
|
| 755 |
-
output_attentions=output_attentions,
|
| 756 |
-
output_hidden_states=output_hidden_states,
|
| 757 |
-
return_dict=True,
|
| 758 |
-
cache_position=cache_position,
|
| 759 |
-
**kwargs,
|
| 760 |
-
)
|
| 761 |
-
|
| 762 |
-
output = Qwen2_5_VLModelOutputWithPast(
|
| 763 |
-
last_hidden_state=outputs.last_hidden_state,
|
| 764 |
-
past_key_values=outputs.past_key_values,
|
| 765 |
-
hidden_states=outputs.hidden_states,
|
| 766 |
-
attentions=outputs.attentions,
|
| 767 |
-
rope_deltas=self.rope_deltas,
|
| 768 |
-
)
|
| 769 |
-
return output if return_dict else output.to_tuple()
|
| 770 |
|
| 771 |
def get_rope_index_with_compare_token(
|
| 772 |
self,
|
|
|
|
| 8 |
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
|
| 9 |
Qwen2_5_VisionTransformerPretrainedModel,
|
| 10 |
Qwen2_5_VLModel,
|
|
|
|
|
|
|
| 11 |
Qwen2RMSNorm,
|
| 12 |
Qwen2_5_VLMLP,
|
|
|
|
| 13 |
ALL_ATTENTION_FUNCTIONS
|
| 14 |
)
|
| 15 |
from transformers.image_utils import ImageInput
|
|
|
|
| 615 |
# image_embeds = torch.cat(enhanced_image_embeds, dim=0)
|
| 616 |
return enhanced_image_embeds
|
| 617 |
|
| 618 |
+
def get_rope_index(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute RoPE indices by delegating to ``get_rope_index_with_compare_token``.

    This method adds no logic of its own; it forwards every argument to
    ``self.get_rope_index_with_compare_token`` and returns its result.
    NOTE(review): presumably this overrides the base-class ``get_rope_index``
    so that inherited code paths pick up the compare-token variant — confirm
    against the parent class.

    Args:
        input_ids: Token ids of the input sequence, or ``None``.
        image_grid_thw: Per-image temporal/height/width grid sizes, or ``None``.
        video_grid_thw: Per-video temporal/height/width grid sizes, or ``None``.
        second_per_grid_ts: Per-video time interval of each temporal grid step,
            or ``None``.
        attention_mask: Attention mask for ``input_ids``, or ``None``.

    Returns:
        Whatever ``get_rope_index_with_compare_token`` returns; callers unpack
        it as ``(position_ids, rope_deltas)``.
    """
    # The trailing optional arguments are passed by keyword so this wrapper
    # does not silently depend on the positional order of the helper's
    # parameters.
    return self.get_rope_index_with_compare_token(
        input_ids,
        image_grid_thw,
        video_grid_thw,
        second_per_grid_ts=second_per_grid_ts,
        attention_mask=attention_mask,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
|
| 621 |
def get_rope_index_with_compare_token(
|
| 622 |
self,
|
tokenizer_config.json
CHANGED
|
@@ -202,12 +202,8 @@
|
|
| 202 |
"eos_token": "<|im_end|>",
|
| 203 |
"errors": "replace",
|
| 204 |
"extra_special_tokens": {},
|
| 205 |
-
"max_length": null,
|
| 206 |
"model_max_length": 131072,
|
| 207 |
-
"pad_to_multiple_of": null,
|
| 208 |
"pad_token": "<|endoftext|>",
|
| 209 |
-
"pad_token_type_id": 0,
|
| 210 |
-
"padding_side": "right",
|
| 211 |
"processor_class": "YangJianProcessor",
|
| 212 |
"split_special_tokens": false,
|
| 213 |
"tokenizer_class": "Qwen2Tokenizer",
|
|
|
|
| 202 |
"eos_token": "<|im_end|>",
|
| 203 |
"errors": "replace",
|
| 204 |
"extra_special_tokens": {},
|
|
|
|
| 205 |
"model_max_length": 131072,
|
|
|
|
| 206 |
"pad_token": "<|endoftext|>",
|
|
|
|
|
|
|
| 207 |
"processor_class": "YangJianProcessor",
|
| 208 |
"split_special_tokens": false,
|
| 209 |
"tokenizer_class": "Qwen2Tokenizer",
|