Spaces:
Running on Zero
Running on Zero
| from transformers import Qwen2_5_VLModel | |
| import torch | |
| from typing import Optional, Union | |
| class QwenImageTextEncoder(torch.nn.Module): | |
| def __init__(self): | |
| super().__init__() | |
| from transformers import Qwen2_5_VLConfig | |
| config = Qwen2_5_VLConfig(**{ | |
| "architectures": [ | |
| "Qwen2_5_VLForConditionalGeneration" | |
| ], | |
| "attention_dropout": 0.0, | |
| "bos_token_id": 151643, | |
| "eos_token_id": 151645, | |
| "hidden_act": "silu", | |
| "hidden_size": 3584, | |
| "image_token_id": 151655, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 18944, | |
| "max_position_embeddings": 128000, | |
| "max_window_layers": 28, | |
| "model_type": "qwen2_5_vl", | |
| "num_attention_heads": 28, | |
| "num_hidden_layers": 28, | |
| "num_key_value_heads": 4, | |
| "rms_norm_eps": 1e-06, | |
| "rope_scaling": { | |
| "mrope_section": [ | |
| 16, | |
| 24, | |
| 24 | |
| ], | |
| "rope_type": "default", | |
| "type": "default" | |
| }, | |
| "rope_theta": 1000000.0, | |
| "sliding_window": 32768, | |
| "text_config": { | |
| "architectures": [ | |
| "Qwen2_5_VLForConditionalGeneration" | |
| ], | |
| "attention_dropout": 0.0, | |
| "bos_token_id": 151643, | |
| "eos_token_id": 151645, | |
| "hidden_act": "silu", | |
| "hidden_size": 3584, | |
| "image_token_id": None, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 18944, | |
| "layer_types": [ | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention", | |
| "full_attention" | |
| ], | |
| "max_position_embeddings": 128000, | |
| "max_window_layers": 28, | |
| "model_type": "qwen2_5_vl_text", | |
| "num_attention_heads": 28, | |
| "num_hidden_layers": 28, | |
| "num_key_value_heads": 4, | |
| "rms_norm_eps": 1e-06, | |
| "rope_scaling": { | |
| "mrope_section": [ | |
| 16, | |
| 24, | |
| 24 | |
| ], | |
| "rope_type": "default", | |
| "type": "default" | |
| }, | |
| "rope_theta": 1000000.0, | |
| "sliding_window": None, | |
| "torch_dtype": "float32", | |
| "use_cache": True, | |
| "use_sliding_window": False, | |
| "video_token_id": None, | |
| "vision_end_token_id": 151653, | |
| "vision_start_token_id": 151652, | |
| "vision_token_id": 151654, | |
| "vocab_size": 152064 | |
| }, | |
| "tie_word_embeddings": False, | |
| "torch_dtype": "float32", | |
| "transformers_version": "4.54.0", | |
| "use_cache": True, | |
| "use_sliding_window": False, | |
| "video_token_id": 151656, | |
| "vision_config": { | |
| "depth": 32, | |
| "fullatt_block_indexes": [ | |
| 7, | |
| 15, | |
| 23, | |
| 31 | |
| ], | |
| "hidden_act": "silu", | |
| "hidden_size": 1280, | |
| "in_channels": 3, | |
| "in_chans": 3, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 3420, | |
| "model_type": "qwen2_5_vl", | |
| "num_heads": 16, | |
| "out_hidden_size": 3584, | |
| "patch_size": 14, | |
| "spatial_merge_size": 2, | |
| "spatial_patch_size": 14, | |
| "temporal_patch_size": 2, | |
| "tokens_per_second": 2, | |
| "torch_dtype": "float32", | |
| "window_size": 112 | |
| }, | |
| "vision_end_token_id": 151653, | |
| "vision_start_token_id": 151652, | |
| "vision_token_id": 151654, | |
| "vocab_size": 152064 | |
| }) | |
| self.model = Qwen2_5_VLModel(config) | |
| self.lm_head = torch.nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) | |
| def forward( | |
| self, | |
| input_ids: torch.LongTensor = None, | |
| attention_mask: Optional[torch.Tensor] = None, | |
| position_ids: Optional[torch.LongTensor] = None, | |
| past_key_values = None, | |
| inputs_embeds: Optional[torch.FloatTensor] = None, | |
| labels: Optional[torch.LongTensor] = None, | |
| use_cache: Optional[bool] = None, | |
| output_attentions: Optional[bool] = None, | |
| output_hidden_states: Optional[bool] = None, | |
| pixel_values: Optional[torch.Tensor] = None, | |
| pixel_values_videos: Optional[torch.FloatTensor] = None, | |
| image_grid_thw: Optional[torch.LongTensor] = None, | |
| video_grid_thw: Optional[torch.LongTensor] = None, | |
| rope_deltas: Optional[torch.LongTensor] = None, | |
| cache_position: Optional[torch.LongTensor] = None, | |
| second_per_grid_ts: Optional[torch.Tensor] = None, | |
| logits_to_keep: Union[int, torch.Tensor] = 0, | |
| **kwargs, | |
| ): | |
| r""" | |
| labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): | |
| Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., | |
| config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored | |
| (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. | |
| image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): | |
| The temporal, height and width of feature shape of each image in LLM. | |
| video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): | |
| The temporal, height and width of feature shape of each video in LLM. | |
| rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): | |
| The rope index difference between sequence length and multimodal rope. | |
| second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*): | |
| The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs. | |
| Example: | |
| ```python | |
| >>> from PIL import Image | |
| >>> import requests | |
| >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration | |
| >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") | |
| >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") | |
| >>> messages = [ | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "image"}, | |
| {"type": "text", "text": "What is shown in this image?"}, | |
| ], | |
| }, | |
| ] | |
| >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" | |
| >>> image = Image.open(requests.get(url, stream=True).raw) | |
| >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) | |
| >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) | |
| >>> # Generate | |
| >>> generate_ids = model.generate(inputs.input_ids, max_length=30) | |
| >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | |
| "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." | |
| ```""" | |
| output_attentions = False | |
| output_hidden_states = True | |
| outputs = self.model( | |
| input_ids=input_ids, | |
| pixel_values=pixel_values, | |
| pixel_values_videos=pixel_values_videos, | |
| image_grid_thw=image_grid_thw, | |
| video_grid_thw=video_grid_thw, | |
| second_per_grid_ts=second_per_grid_ts, | |
| position_ids=position_ids, | |
| attention_mask=attention_mask, | |
| past_key_values=past_key_values, | |
| inputs_embeds=inputs_embeds, | |
| use_cache=use_cache, | |
| output_attentions=output_attentions, | |
| output_hidden_states=output_hidden_states, | |
| return_dict=True, | |
| cache_position=cache_position, | |
| **kwargs, | |
| ) | |
| return outputs.hidden_states | |
| def state_dict_converter(): | |
| return QwenImageTextEncoderStateDictConverter() | |
| class QwenImageTextEncoderStateDictConverter(): | |
| def __init__(self): | |
| pass | |
| def from_diffusers(self, state_dict): | |
| state_dict_ = {} | |
| for k, v in state_dict.items(): | |
| if k.startswith("visual."): | |
| k = "model." + k | |
| elif k.startswith("model."): | |
| k = k.replace("model.", "model.language_model.") | |
| state_dict_[k] = v | |
| return state_dict_ | |