import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
    QWEN2_5_VL_INPUTS_DOCSTRING,
    Qwen2_5_VLCausalLMOutputWithPast,
    Qwen2_5_VLForConditionalGeneration,
)

if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func
    from flash_attn.layers.rotary import apply_rotary_emb
else:
    flash_attn_varlen_func = None
    apply_rotary_emb = None

if is_flash_attn_2_available():
    from transformers.modeling_flash_attention_utils import _flash_attention_forward
else:
    _flash_attention_forward = None

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "Qwen2_5_VLConfig"


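# Qwen2.5-VL wrapper for embedding extraction: `forward` fuses image/video features into
# the token embeddings and returns the language model's hidden states directly (no LM head,
# no loss), leaving pooling and normalization to the caller.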
class Qwen25VLForEmbedding(Qwen2_5_VLForConditionalGeneration):
    _tied_weights_keys = ["lm_head.weight"]
    config_class = Qwen2_5_VLConfig
    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]

    @add_start_docstrings_to_model_forward(QWEN2_5_VL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Qwen2_5_VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        second_per_grid_ts: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
| | r""" |
| | Args: |
| | labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): |
| | Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., |
| | config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored |
| | (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. |
| | |
| | Returns: |
| | |
| | Example: |
| | |
| | ```python |
| | >>> from PIL import Image |
| | >>> import requests |
| | >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration |
| | |
| | >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") |
| | >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") |
| | |
| | >>> messages = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | {"type": "image"}, |
| | {"type": "text", "text": "What is shown in this image?"}, |
| | ], |
| | }, |
| | ] |
| | >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" |
| | >>> image = Image.open(requests.get(url, stream=True).raw) |
| | |
| | >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) |
| | >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) |
| | |
| | >>> # Generate |
| | >>> generate_ids = model.generate(inputs.input_ids, max_length=30) |
| | >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] |
| | "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." |
| | ```""" |
| |
|
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

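        # Embed the text tokens first, then splice vision-tower features into the positions
        # of the image/video placeholder tokens so the language model sees one mixed sequence.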
        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.dtype)
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )

                mask = input_ids == self.config.image_token_id
                mask_unsqueezed = mask.unsqueeze(-1)
                mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
                image_mask = mask_expanded.to(inputs_embeds.device)

                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )

                mask = input_ids == self.config.video_token_id
                mask_unsqueezed = mask.unsqueeze(-1)
                mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
                video_mask = mask_expanded.to(inputs_embeds.device)

                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # With a 4D attention mask we cannot infer padding, so multimodal RoPE indices are
        # only computed for 2D (or absent) masks.
        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
            # Calculate the RoPE index once per generation, in the pre-fill stage only.
            if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids,
                    image_grid_thw,
                    video_grid_thw,
                    second_per_grid_ts,
                    attention_mask,
                )
                self.rope_deltas = rope_deltas
            # Otherwise reuse the cached rope deltas to derive the correct position ids.
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                delta = (
                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
                    if cache_position is not None
                    else 0
                )
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                if cache_position is not None:  # otherwise `delta` is an int `0`
                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                position_ids = position_ids.add(delta)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

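        # Run the language model over the merged embeddings and return its raw outputs
        # (hidden states, optional attentions/cache); the LM head is intentionally not applied.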
        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        return outputs
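

# --- Minimal usage sketch (illustrative only, not part of the original module) ---
# One way to turn this class into text embeddings: run the forward pass (which returns
# hidden states rather than logits) and pool the last non-padded token. The checkpoint
# name and the last-token pooling / L2 normalization choices here are assumptions.
if __name__ == "__main__":
    from transformers import AutoProcessor

    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"  # assumed checkpoint; swap in your own
    model = Qwen25VLForEmbedding.from_pretrained(model_name)
    processor = AutoProcessor.from_pretrained(model_name)

    inputs = processor(text=["a red stop sign at a street corner"], return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, return_dict=True)

    # Last-token pooling: pick the hidden state at each sequence's final non-padded position.
    last_positions = inputs["attention_mask"].sum(dim=1) - 1
    hidden = outputs.last_hidden_state
    embeddings = hidden[torch.arange(hidden.shape[0]), last_positions]
    embeddings = F.normalize(embeddings, p=2, dim=-1)
    print(embeddings.shape)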