diff --git a/.gitattributes b/.gitattributes index 4693aff96530df800098c4e9b1871bc63e6bdb1f..c16337ad34b115a91e34c1f71498c9d32bd70bb9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -427,3 +427,9 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/ .venv/lib/python3.11/site-packages/transformers/generation/__pycache__/tf_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/transformers/__pycache__/trainer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_base.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/transformers/__pycache__/cache_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_outputs.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/transformers/__pycache__/testing_utils.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/cache_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/cache_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7bf6de6d3a72003f00715113ea1907b1c80962f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/cache_utils.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac78e0a7f936cac1a2823c1bfc4b77a84e4387d40c6e7aa7159c3ec3c687948c +size 117848 diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_outputs.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_outputs.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fba2a87838d94f1278879b4318f29cbe8e7756d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_outputs.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0523870630a2869713edfeb509a7e8b578c6d12b60f570df1980ff8967248fd2 +size 127236 diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af07a0be0e64e30ca104f91771692951e12badb9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modeling_tf_utils.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e15e9fd16dfb7cbb991cc343b35714ca64d0a7b2e0f5c89d63a9ff6a90e90725 +size 175992 diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/testing_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/testing_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce64d521b85098585608e72e18106709ae49d818 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/transformers/__pycache__/testing_utils.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b493620b1fc9b0aaf24187c7ad47b18c0ff3b15e71af9535f0d7883ce85b51b4 +size 137012 diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..924f49aa2d30692a655745f07abb0d220d3b4979 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/tokenization_utils_base.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8963a7b087496eb2d4c608e1dac4b1976dcbf7a6e85994fbc68424429714c0f +size 206631 diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f03d954b70d324c4d6dbdab3d3f2237c253b6a6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36e1189b4c27edfd98a2a4a0c43c82f487c0df42bf3f2e26ea86698c5eabd15 +size 165859 diff --git a/.venv/lib/python3.11/site-packages/transformers/models/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7db328f87af1fbd1bdc11519d769a41f7a34528e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/__init__.py @@ -0,0 +1,304 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import ( + albert, + align, + altclip, + aria, + audio_spectrogram_transformer, + auto, + autoformer, + bamba, + bark, + bart, + barthez, + bartpho, + beit, + bert, + bert_generation, + bert_japanese, + bertweet, + big_bird, + bigbird_pegasus, + biogpt, + bit, + blenderbot, + blenderbot_small, + blip, + blip_2, + bloom, + bridgetower, + bros, + byt5, + camembert, + canine, + chameleon, + chinese_clip, + clap, + clip, + clipseg, + clvp, + code_llama, + codegen, + cohere, + cohere2, + colpali, + conditional_detr, + convbert, + convnext, + convnextv2, + cpm, + cpmant, + ctrl, + cvt, + dac, + data2vec, + dbrx, + deberta, + deberta_v2, + decision_transformer, + deformable_detr, + deit, + deprecated, + depth_anything, + detr, + dialogpt, + diffllama, + dinat, + dinov2, + dinov2_with_registers, + distilbert, + dit, + donut, + dpr, + dpt, + efficientnet, + electra, + emu3, + encodec, + encoder_decoder, + ernie, + esm, + falcon, + falcon_mamba, + fastspeech2_conformer, + flaubert, + flava, + fnet, + focalnet, + fsmt, + funnel, + fuyu, + gemma, + gemma2, + git, + glm, + glpn, + gpt2, + gpt_bigcode, + gpt_neo, + gpt_neox, + gpt_neox_japanese, + gpt_sw3, + gptj, + granite, + granitemoe, + grounding_dino, + groupvit, + herbert, + hiera, + hubert, + ibert, + idefics, + idefics2, + idefics3, + ijepa, + imagegpt, + informer, + instructblip, + instructblipvideo, + jamba, + jetmoe, + kosmos2, + layoutlm, + layoutlmv2, + layoutlmv3, + layoutxlm, + led, + levit, + lilt, + llama, + llava, + llava_next, + llava_next_video, + llava_onevision, + longformer, + longt5, + luke, + lxmert, + m2m_100, + mamba, + mamba2, + marian, + markuplm, + mask2former, + maskformer, + mbart, + mbart50, + megatron_bert, + megatron_gpt2, + mgp_str, + mimi, + mistral, + mixtral, + mllama, + mluke, + mobilebert, + mobilenet_v1, + mobilenet_v2, + mobilevit, + mobilevitv2, + modernbert, + moonshine, + moshi, + mpnet, + mpt, + mra, + mt5, + musicgen, + musicgen_melody, + mvp, + myt5, + nemotron, + nllb, + nllb_moe, + nougat, + nystromformer, + olmo, + olmo2, + olmoe, + omdet_turbo, + oneformer, + openai, + opt, + owlv2, + owlvit, + paligemma, + patchtsmixer, + patchtst, + pegasus, + pegasus_x, + perceiver, + persimmon, + phi, + phi3, + phimoe, + phobert, + pix2struct, + pixtral, + plbart, + poolformer, + pop2piano, + prophetnet, + pvt, + pvt_v2, + qwen2, + qwen2_audio, + qwen2_moe, + qwen2_vl, + rag, + recurrent_gemma, + reformer, + regnet, + rembert, + resnet, + roberta, + roberta_prelayernorm, + roc_bert, + roformer, + rt_detr, + rwkv, + sam, + seamless_m4t, + seamless_m4t_v2, + segformer, + seggpt, + sew, + sew_d, + siglip, + speech_encoder_decoder, + speech_to_text, + speecht5, + splinter, + squeezebert, + stablelm, + starcoder2, + superpoint, + swiftformer, + swin, + swin2sr, + swinv2, + switch_transformers, + t5, + table_transformer, + tapas, + textnet, + time_series_transformer, + timesformer, + timm_backbone, + timm_wrapper, + trocr, + tvp, + udop, + umt5, + unispeech, + unispeech_sat, + univnet, + upernet, + video_llava, + videomae, + vilt, + vipllava, + vision_encoder_decoder, + vision_text_dual_encoder, + visual_bert, + vit, + vit_mae, + vit_msn, + vitdet, + vitmatte, + vitpose, + vitpose_backbone, + vits, + vivit, + wav2vec2, + wav2vec2_bert, + wav2vec2_conformer, + wav2vec2_phoneme, + wav2vec2_with_lm, + wavlm, + whisper, + x_clip, + xglm, + xlm, + xlm_roberta, + xlm_roberta_xl, + xlnet, + xmod, + yolos, + yoso, + zamba, + zoedepth, +) diff --git 
a/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..66009fc3ef060507748e7fc5244625f80b25fed2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -0,0 +1,665 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Classes to support TF Encoder-Decoder architectures""" + +from __future__ import annotations + +import inspect +import re +import warnings +from typing import Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...configuration_utils import PretrainedConfig +from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + get_initializer, + keras, + unpack_inputs, +) +from ...tf_utils import shape_list +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..auto.configuration_auto import AutoConfig +from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM +from .configuration_encoder_decoder import EncoderDecoderConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "EncoderDecoderConfig" + +DEPRECATION_WARNING = ( + "Version v4.17.0 introduces a better way to train encoder-decoder models by computing the loss inside the" + " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if" + " fine-tuning a model trained with versions anterior to 4.17.0. The decoder_input_ids are now created based on the" + " labels, no need to pass them yourself anymore." +) + +ENCODER_DECODER_START_DOCSTRING = r""" + This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the + encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via + [`~TFAutoModel.from_pretrained`] function and the decoder is loaded via [`~TFAutoModelForCausalLM.from_pretrained`] + function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream + generative task, like summarization. + + The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation + tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation + Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi + Zhou, Wei Li, Peter J. Liu. + + After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models + (see the examples for more information). + + This model inherits from [`TFPreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ENCODER_DECODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + Provide for sequence to sequence training to the decoder. Indices can be obtained using + [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for + details. + decoder_attention_mask (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + encoder_outputs (`tuple(tuple(tf.Tensor)`, *optional*): + This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` (`tf.Tensor` of shape `({0}, hidden_size)`) is a tensor of hidden-states at the output + of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(tf.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `({0})`. 
+ inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + decoder_inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + labels (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0, + ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors: + + - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function. + - With a *decoder_* prefix which will be input as `**decoder_kwargs`` for the decoder forward function. 
+""" + + +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + if pad_token_id is None: + raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + + if decoder_start_token_id is None: + raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.") + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + + start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING) +class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss): + r""" + [`TFEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one + of the base model classes of the library as encoder and another one as decoder when created with the + [`~TFAutoModel.from_pretrained`] class method for the encoder and [`~TFAutoModelForCausalLM.from_pretrained`] class + method for the decoder. + """ + + config_class = EncoderDecoderConfig + base_model_prefix = "encoder_decoder" + load_weight_prefix = "tf_encoder_decoder_model" + + def __init__( + self, + config: Optional[PretrainedConfig] = None, + encoder: Optional[TFPreTrainedModel] = None, + decoder: Optional[TFPreTrainedModel] = None, + ): + if config is None and (encoder is None or decoder is None): + raise ValueError("Either a configuration or an encoder and a decoder has to be provided.") + if config is None: + config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config) + else: + if not isinstance(config, self.config_class): + raise ValueError(f"config: {config} has to be of type {self.config_class}") + + if config.decoder.cross_attention_hidden_size is not None: + if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size: + raise ValueError( + "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal" + f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for" + f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for" + " `config.encoder.hidden_size`." 
+ ) + + # initialize with config + super().__init__(config) + + if encoder is None: + encoder = TFAutoModel.from_config(config.encoder, name="encoder") + + if decoder is None: + decoder = TFAutoModelForCausalLM.from_config(config.decoder, name="decoder") + + self.encoder = encoder + self.decoder = decoder + + if self.encoder.config.to_dict() != self.config.encoder.to_dict(): + logger.warning( + f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:" + f" {self.config.encoder}" + ) + if self.decoder.config.to_dict() != self.config.decoder.to_dict(): + logger.warning( + f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:" + f" {self.config.decoder}" + ) + + # make sure that the individual model's config refers to the shared config + # so that the updates to the config will be synced + self.encoder.config = self.config.encoder + self.decoder.config = self.config.decoder + + # encoder outputs might need to be projected to different dimension for decoder + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + self.enc_to_dec_proj = keras.layers.Dense( + units=self.decoder.config.hidden_size, + kernel_initializer=get_initializer(config.encoder.initializer_range), + name="enc_to_dec_proj", + ) + + if self.encoder.get_output_embeddings() is not None: + raise ValueError( + f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head" + ) + + decoder_signature = set(inspect.signature(self.decoder.call).parameters.keys()) + if "encoder_hidden_states" not in decoder_signature: + raise ValueError( + "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the " + "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350" + ) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_input_embeddings(self): + return self.encoder.get_input_embeddings() + + def get_output_embeddings(self): + return self.decoder.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + return self.decoder.set_output_embeddings(new_embeddings) + + def tf_to_pt_weight_rename(self, tf_weight): + # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models + # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal. + # However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption + # here that the config model_type is the same as the name of the MainLayer. I don't know of anywhere that's + # not the case, and I wasn't sure how else to go from the config to the correct MainLayer name! + + # This override is only needed in the case where we're crossloading weights from PT. However, since weights are + # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file. + # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it + # or not. 
+ encoder_model_type = self.config.encoder.model_type + if "encoder" in tf_weight and "decoder" not in tf_weight: + return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),) + else: + return (tf_weight,) + + @classmethod + def from_encoder_decoder_pretrained( + cls, + encoder_pretrained_model_name_or_path: str = None, + decoder_pretrained_model_name_or_path: str = None, + *model_args, + **kwargs, + ) -> TFPreTrainedModel: + r""" + Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model + checkpoints. + + + Params: + encoder_pretrained_model_name_or_path (`str`, *optional*): + Information necessary to initiate the encoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case, + `encoder_from_pt` should be set to `True`. + + decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`): + Information necessary to initiate the decoder. Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case, + `decoder_from_pt` should be set to `True`. + + model_args (remaining positional arguments, *optional*): + All remaning positional arguments will be passed to the underlying model's `__init__` method. + + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). + + - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter. + - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter. + - To update the parent model configuration, do not use a prefix for each configuration parameter. + + Behaves differently depending on whether a `config` is provided or automatically loaded. + + Example: + + ```python + >>> from transformers import TFEncoderDecoderModel + + >>> # initialize a bert2gpt2 from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized + >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "openai-community/gpt2") + >>> # saving model after fine-tuning + >>> model.save_pretrained("./bert2gpt2") + >>> # load fine-tuned model + >>> model = TFEncoderDecoderModel.from_pretrained("./bert2gpt2") + ```""" + + kwargs_encoder = { + argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") + } + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + # remove encoder, decoder kwargs from kwargs + for key in kwargs_encoder.keys(): + del kwargs["encoder_" + key] + for key in kwargs_decoder.keys(): + del kwargs["decoder_" + key] + + # Load and initialize the encoder and decoder + # The distinction between encoder and decoder at the model level is made + # by the value of the flag `is_decoder` that we need to set correctly. 
+ encoder = kwargs_encoder.pop("model", None) + if encoder is None: + if encoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_encoder: + encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path) + if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True: + logger.info( + f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model " + "from a decoder model. Cross-attention and casual mask are disabled." + ) + encoder_config.is_decoder = False + encoder_config.add_cross_attention = False + + kwargs_encoder["config"] = encoder_config + + kwargs_encoder["name"] = "encoder" + kwargs_encoder["load_weight_prefix"] = cls.load_weight_prefix + encoder = TFAutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) + + decoder = kwargs_decoder.pop("model", None) + if decoder is None: + if decoder_pretrained_model_name_or_path is None: + raise ValueError( + "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has " + "to be defined." + ) + + if "config" not in kwargs_decoder: + decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path) + if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False: + logger.info( + f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention" + f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if" + f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers." + ) + decoder_config.is_decoder = True + decoder_config.add_cross_attention = True + + kwargs_decoder["config"] = decoder_config + + if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False: + logger.warning( + f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. " + f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, " + "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` " + "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a " + "`decoder_config` to `.from_encoder_decoder_pretrained(...)`" + ) + + kwargs_decoder["name"] = "decoder" + kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix + decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) + + # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly. 
+ if encoder.name != "encoder": + raise ValueError("encoder model must be created with the name `encoder`.") + if decoder.name != "decoder": + raise ValueError("decoder model must be created with the name `decoder`.") + + # instantiate config with corresponding kwargs + config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs) + return cls(encoder=encoder, decoder=decoder, config=config) + + @unpack_inputs + @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: np.ndarray | tf.Tensor | None = None, + past_key_values: Tuple[Tuple[tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + labels: np.ndarray | tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFSeq2SeqLMOutput, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TFEncoderDecoderModel, BertTokenizer + + >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized + >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2") + + >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased") + + >>> # forward + >>> input_ids = tokenizer.encode( + ... "Hello, my dog is cute", add_special_tokens=True, return_tensors="tf" + ... ) # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) + + >>> # training + >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids) + >>> loss, logits = outputs.loss, outputs.logits + + >>> # save and load from pretrained + >>> model.save_pretrained("bert2gpt2") + >>> model = TFEncoderDecoderModel.from_pretrained("bert2gpt2") + + >>> # generation + >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")} + + kwargs_decoder = { + argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") + } + + # Let the user be responsible for the expected format. + if encoder_outputs is not None: + if return_dict and not isinstance(encoder_outputs, ModelOutput): + raise ValueError( + "If `return_dict=True` and `encoder_outputs` is provided, it should be an instance of " + f"`ModelOutput`. Got an instance {type(encoder_outputs)} for `encoder_outputs`." 
+ ) + + if encoder_outputs is None: + encoder_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "training": training, + } + + # Add arguments to encoder from `kwargs_encoder` + encoder_inputs.update(kwargs_encoder) + + # Handle the case where the inputs are passed as a single dict which contains `labels`. + # The `labels` shouldn't be passed to `self.encoder` below, because it is a based model without this + # parameter (otherwise, an error occurs when `input_processing` is called inside `self.encoder.call()`). + if "labels" in encoder_inputs: + labels = encoder_inputs.pop("labels") + + # handle the init case where `dummy_inputs` returns a dict containing `decoder_input_ids`. + if "decoder_input_ids" in encoder_inputs: + decoder_input_ids = encoder_inputs.pop("decoder_input_ids") + # handle the init case where `dummy_inputs` returns a dict containing `decoder_input_ids`. + if "decoder_attention_mask" in encoder_inputs: + decoder_attention_mask = encoder_inputs.pop("decoder_attention_mask") + + encoder_outputs = self.encoder(**encoder_inputs) + + encoder_hidden_states = encoder_outputs[0] + + # optionally project encoder_hidden_states + if ( + self.encoder.config.hidden_size != self.decoder.config.hidden_size + and self.decoder.config.cross_attention_hidden_size is None + ): + encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states) + + if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None): + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + decoder_inputs = { + "input_ids": decoder_input_ids, + "attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + "encoder_attention_mask": attention_mask, + "inputs_embeds": decoder_inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "use_cache": use_cache, + "past_key_values": past_key_values, + "return_dict": return_dict, + "training": training, + } + + # Add arguments to decoder from `kwargs_decoder` + decoder_inputs.update(kwargs_decoder) + + decoder_outputs = self.decoder(**decoder_inputs) + + logits = decoder_outputs[0] + + # Compute loss independent from decoder (as some shift the logits inside them) + loss = None + if labels is not None: + warnings.warn(DEPRECATION_WARNING, FutureWarning) + loss = self.hf_compute_loss(labels, logits) + + if not return_dict: + past_key_values = None + if use_cache: + past_key_values = decoder_outputs[1] + # The starting index of the remaining elements in `decoder_outputs` + start_index = sum([1 if x is not None else 0 for x in (loss, logits, past_key_values)]) + + if not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + output = (loss, logits, past_key_values) + decoder_outputs[start_index:] + encoder_outputs + output = tuple([x for x in output if x is not None]) + return output + + return TFSeq2SeqLMOutput( + loss=loss, + logits=decoder_outputs.logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + 
encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) + decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None + past_key_values = decoder_inputs.get("past_key_values") + if past_key_values is None: + past_key_values = decoder_inputs.get("past") # e.g. on TF GPT2 + input_dict = { + "input_ids": None, # needs to be passed to make Keras.layer.__call__ happy + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_input_ids": decoder_inputs["input_ids"], + # TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete + "encoder_outputs": TFBaseModelOutput(last_hidden_state=encoder_outputs[0]), + "past_key_values": past_key_values, + "use_cache": use_cache, + } + return input_dict + + def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + def resize_token_embeddings(self, *args, **kwargs): + raise NotImplementedError( + "Resizing the embedding layers via the TFEncoderDecoderModel directly is not supported.Please use the" + " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or" + " model.decoder.resize_token_embeddings(...))" + ) + + def _reorder_cache(self, past, beam_idx): + # apply decoder cache reordering here + return self.decoder._reorder_cache(past, beam_idx) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "enc_to_dec_proj", None) is not None: + with tf.name_scope(self.enc_to_dec_proj.name): + self.enc_to_dec_proj.build([None, None, self.encoder.config.hidden_size]) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +__all__ = ["TFEncoderDecoderModel"] diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__init__.py b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0369165da24d756e8d8043e3d9e2d5c95857fc38 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
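For context on the `modeling_tf_encoder_decoder.py` hunk above: the vendored `shift_tokens_right` helper builds `decoder_input_ids` from `labels` by prepending `decoder_start_token_id` and replacing any `-100` (loss-ignored) positions with `pad_token_id`, and `TFEncoderDecoderModel.call` invokes it whenever `labels` are passed without explicit `decoder_input_ids`. A minimal standalone sketch of that behavior — the token ids below are made up for illustration and are not part of the vendored file:

```python
import tensorflow as tf

# Toy labels; -100 marks positions ignored by the loss.
labels = tf.constant([[5, 6, 7, -100]])
pad_token_id = tf.cast(0, labels.dtype)
decoder_start_token_id = tf.cast(2, labels.dtype)

# Prepend the start token and drop the last label (shift right by one).
start_tokens = tf.fill((labels.shape[0], 1), decoder_start_token_id)
shifted = tf.concat([start_tokens, labels[:, :-1]], axis=-1)
# Replace any remaining -100 values with the pad token, as the vendored helper does.
shifted = tf.where(shifted == -100, tf.fill(tf.shape(shifted), pad_token_id), shifted)

print(shifted.numpy())  # [[2 5 6 7]]
```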
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_llava_onevision import * + from .image_processing_llava_onevision import * + from .modeling_llava_onevision import * + from .processing_llava_onevision import * + from .video_processing_llava_onevision import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3a9032d0cbde38ef5605950814f479c7a9f8f6a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/configuration_llava_onevision.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/configuration_llava_onevision.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2ee98a835f7234fe28a6b8b9c41594b0dfbb50e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/configuration_llava_onevision.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/image_processing_llava_onevision.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/image_processing_llava_onevision.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b43fcc7b8da39427207b947ab1041eba80e2cc1c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/image_processing_llava_onevision.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/modeling_llava_onevision.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/modeling_llava_onevision.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4792c08fe60a0361b7df93336a5398b5d0e84288 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/modeling_llava_onevision.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/processing_llava_onevision.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/processing_llava_onevision.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cb2b14cf8575d62ac1d1d9a7537d018018b014f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/processing_llava_onevision.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/video_processing_llava_onevision.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/video_processing_llava_onevision.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..34d872f72fa7ba05936b90246415185c393a5688 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/__pycache__/video_processing_llava_onevision.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/configuration_llava_onevision.py b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/configuration_llava_onevision.py new file mode 100644 index 0000000000000000000000000000000000000000..504e8a7878be40f0a2123fddb45dea1eb7433ad7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -0,0 +1,190 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...configuration_utils import PretrainedConfig +from ...utils import ( + logging, +) +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class LlavaOnevisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlavaOnevisionForConditionalGeneration`]. It is used to instantiate an + Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf) + model. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`): + The config object or dictionary of the text backbone. + image_token_index (`int`, *optional*, defaults to 151646): + The image token index to encode the image prompt. + video_token_index (`int`, *optional*, defaults to 151647): + The video token index to encode the video prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`): + Aspect ratio used when processong image features. The default value is "anyres_max_9". 
+ image_grid_pinpoints (`List`, *optional*): + A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list + of the form `(height, width)`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + multimodal_projector_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in the multimodal projector. + + Example: + + ```python + >>> from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionConfig, SiglipVisionConfig, Qwen2Config + + >>> # Initializing a CLIP-vision config + >>> vision_config = SiglipVisionConfig() + + >>> # Initializing a Llama config + >>> text_config = Qwen2Config() + + >>> # Initializing a Llava-Next llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration + >>> configuration = LlavaOnevisionConfig(vision_config, text_config) + + >>> # Initializing a model from the llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration + >>> model = LlavaOnevisionForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava_onevision" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_index=151646, + video_token_index=151647, + projector_hidden_act="gelu", + vision_feature_select_strategy="full", + vision_feature_layer=-1, + vision_aspect_ratio="anyres_max_9", + image_grid_pinpoints=None, + tie_word_embeddings=False, + multimodal_projector_bias=True, + **kwargs, + ): + self.image_token_index = image_token_index + self.video_token_index = video_token_index + self.projector_hidden_act = projector_hidden_act + self.multimodal_projector_bias = multimodal_projector_bias + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." 
+ f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.vision_aspect_ratio = vision_aspect_ratio + image_grid_pinpoints = ( + image_grid_pinpoints + if image_grid_pinpoints is not None + else [ + [384, 384], + [384, 768], + [384, 1152], + [384, 1536], + [384, 1920], + [384, 2304], + [768, 384], + [768, 768], + [768, 1152], + [768, 1536], + [768, 1920], + [768, 2304], + [1152, 384], + [1152, 768], + [1152, 1152], + [1152, 1536], + [1152, 1920], + [1152, 2304], + [1536, 384], + [1536, 768], + [1536, 1152], + [1536, 1536], + [1536, 1920], + [1536, 2304], + [1920, 384], + [1920, 768], + [1920, 1152], + [1920, 1536], + [1920, 1920], + [1920, 2304], + [2304, 384], + [2304, 768], + [2304, 1152], + [2304, 1536], + [2304, 1920], + [2304, 2304], + ] + ) + self.image_grid_pinpoints = image_grid_pinpoints + + if isinstance(vision_config, dict): + vision_config["model_type"] = ( + vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model" + ) + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip_vision_model"]( + hidden_size=1152, + intermediate_size=4304, + patch_size=14, + image_size=384, + num_hidden_layers=26, + num_attention_heads=14, + vision_use_head=False, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]() + + self.text_config = text_config + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +__all__ = ["LlavaOnevisionConfig"] diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/image_processing_llava_onevision.py b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/image_processing_llava_onevision.py new file mode 100644 index 0000000000000000000000000000000000000000..75581d25aefaa4b8a87ea1a1111df6d7f2d736d6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -0,0 +1,715 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for LLaVa-Onevision.""" + +import math +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution +from ...image_transforms import ( + PaddingMode, + convert_to_rgb, + pad, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_valid_image, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + from PIL import Image + + +# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. + + Args: + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. + """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched video from {images}") + + +# Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches +def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: + """ + Divides an image into patches of a specified size. + + Args: + image (`np.array`): + The input image. + patch_size (`int`): + The size of each patch. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + list: A list of np.array representing the patches. + """ + patches = [] + height, width = get_image_size(image, channel_dim=input_data_format) + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + if input_data_format == ChannelDimension.LAST: + patch = image[i : i + patch_size, j : j + patch_size] + else: + patch = image[:, i : i + patch_size, j : j + patch_size] + patches.append(patch) + + return patches + + +# Copied from transformers.models.llava_next.image_processing_llava_next.expand_to_square +def expand_to_square(image: np.array, background_color, input_data_format) -> np.array: + """ + Expands an image to a square by adding a background color. 
+ """ + + height, width = get_image_size(image, channel_dim=input_data_format) + if width == height: + return image + elif width > height: + result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color + result[(width - height) // 2 : (width - height) // 2 + height, :] = image + return result + else: + result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color + result[:, (height - width) // 2 : (height - width) // 2 + width] = image + return result + + +# Copied from transformers.models.llava_next.image_processing_llava_next._get_patch_output_size +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class LlavaOnevisionImageProcessor(BaseImageProcessor): + r""" + Constructs a LLaVa-Onevisino-Video video processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. Not used for processinf videos. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. 
Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
+ number of patches in the batch. Padding will be applied to the bottom and right with zeros.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """
+
+ model_input_names = ["pixel_values_videos"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Dict[str, int] = None,
+ image_grid_pinpoints: List = None,
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_pad: Optional[bool] = True,
+ do_convert_rgb: bool = True,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"height": 384, "width": 384}
+ size = get_size_dict(size, default_to_square=False)
+ image_grid_pinpoints = (
+ image_grid_pinpoints
+ if image_grid_pinpoints is not None
+ else [
+ [384, 384],
+ [384, 768],
+ [384, 1152],
+ [384, 1536],
+ [384, 1920],
+ [384, 2304],
+ [768, 384],
+ [768, 768],
+ [768, 1152],
+ [768, 1536],
+ [768, 1920],
+ [768, 2304],
+ [1152, 384],
+ [1152, 768],
+ [1152, 1152],
+ [1152, 1536],
+ [1152, 1920],
+ [1152, 2304],
+ [1536, 384],
+ [1536, 768],
+ [1536, 1152],
+ [1536, 1536],
+ [1536, 1920],
+ [1536, 2304],
+ [1920, 384],
+ [1920, 768],
+ [1920, 1152],
+ [1920, 1536],
+ [1920, 1920],
+ [1920, 2304],
+ [2304, 384],
+ [2304, 768],
+ [2304, 1152],
+ [2304, 1536],
+ [2304, 1920],
+ [2304, 2304],
+ ]
+ )
+
+ self.do_resize = do_resize
+ self.size = size
+ self.image_grid_pinpoints = image_grid_pinpoints
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+ self.do_pad = do_pad
+ self.do_convert_rgb = do_convert_rgb
+
+ # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.pad
+ def pad(
+ self,
+ image: np.ndarray,
+ padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
+ mode: PaddingMode = PaddingMode.CONSTANT,
+ constant_values: Union[float, Iterable[float]] = 0.0,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.ndarray:
+ """
+ Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
+ dimension or in the (`num_patches`) dimension. In the second case an iterable of tuples is expected
+ as input.
+
+ Args:
+ image (`np.ndarray`):
+ The image to pad.
+ padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
+ Padding to apply to the edges of the height, width axes. Can be one of three formats:
+ - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
+ - `((before, after),)` yields same before and after pad for height and width.
+ - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
+ mode (`PaddingMode`):
+ The padding mode to use.
Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + if mode == PaddingMode.CONSTANT: + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._resize_for_patching + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_patching + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. 
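+ The resized image is centered on the `target_resolution` canvas, with half of the leftover height and
+ width (integer division) added on each side. When the leftover amount is odd, the result is one pixel
+ short of the target on that axis; this is harmless in the default pipeline because every patch is
+ resized back to the base `size` in `_preprocess`.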
+ """ + target_height, target_width = target_resolution + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + + return padded_image + + # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor.get_image_patches + def get_image_patches( + self, + image: np.array, + grid_pinpoints, + size: tuple, + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (np.array): + The input image to be processed. + grid_pinpoints (List): + A string representation of a list of possible resolutions. + size (`tuple`): + Size to resize the original image to. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + List[np.array]: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + + resized_original_image = resize( + image, + size=size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + + image_patches = [resized_original_image] + patches + + return image_patches + + # Copied from transformers.models.llava_next.image_processing_llava_next.LlavaNextImageProcessor._pad_for_batching + def _pad_for_batching( + self, + pixel_values: List[np.ndarray], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches. + + Args: + pixel_values (`List[np.ndarray]`): + An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`) + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. 
+ input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + List[`np.ndarray`]: The padded images. + """ + max_patch = max(len(x) for x in pixel_values) + pixel_values = [ + self.pad( + image, + padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)), + data_format=data_format, + input_data_format=input_data_format, + ) + for image in pixel_values + ] + + return pixel_values + + def _preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Image.Image: + """ + Args: + images (`ImageInput`): + Batch of frames (one video) to preprocess. Expects a batch of frames with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if do_resize: + images = [ + resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + return images + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + image_grid_pinpoints: List = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + image_grid_pinpoints (`List` *optional*, defaults to `self.image_grid_pinpoints`): + A list of possible resolutions to use for processing high resolution images. The best resolution is + selected based on the original size of the image. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. 
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_batched_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
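+ # Only the first image is inspected here, so mixing channels-first and channels-last inputs in one
+ # batch is not supported. Each image is then expanded into its "anyres" crops below:
+ # `get_image_patches` picks the best grid pinpoint for the original resolution, tiles the resized and
+ # padded image into base-size patches and prepends a resized copy of the full image. With the default
+ # 384x384 base size and a best resolution of 768x1152, for example, that yields 1 + 2 * 3 = 7 frames,
+ # which `_preprocess` then normalizes and stacks into a single array per image.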
+ input_data_format = infer_channel_dimension_format(images[0]) + + new_images = [] + image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images] + for image in images: + # convert image into a list of patches + # we intentially use the same data format as the input data format + size_tuple = ( + (size["height"], size["width"]) + if "height" in size and "width" in size + else (size["shortest_edge"], size["shortest_edge"]) + ) + image_patches = self.get_image_patches( + image, + image_grid_pinpoints, + size=size_tuple, + patch_size=size["height"], + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # preprocess patches + pixel_values = self._preprocess( + image_patches, + do_resize=do_resize, + size=size_tuple, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + pixel_values = np.array(pixel_values) + new_images.append(pixel_values) + + if do_pad: + processed_images = self._pad_for_batching(new_images) + + return BatchFeature( + data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors + ) + + +__all__ = ["LlavaOnevisionImageProcessor"] diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/modeling_llava_onevision.py b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/modeling_llava_onevision.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc88ec95ab359ab4a684f7fac75faef5d99fe8e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -0,0 +1,812 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Llava-Onevision model.""" + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...image_processing_utils import select_best_resolution +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + logging, +) +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_llava_onevision import LlavaOnevisionConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlavaNextConfig" + + +# Copied from transformers.models.llava_next.modeling_llava_next.get_anyres_image_grid_shape +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (`tuple`): + The size of the input image in the format (width, height). 
+ grid_pinpoints (`List`): + A list containing possible resolutions. Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError( + f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor" + ) + image_size = image_size.tolist() + + height, width = select_best_resolution(image_size, grid_pinpoints) + return height // patch_size, width // patch_size + + +# Copied from transformers.models.llava_next.modeling_llava_next.image_size_to_num_patches +def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): + """ + Calculate the number of patches after the preprocessing for images of any resolution. + + Args: + image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`): + The size of the input image in the format (height, width). ? + grid_pinpoints (`List`): + A list containing possible resolutions. Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + int: the number of patches + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}") + image_size = image_size.tolist() + + best_resolution = select_best_resolution(image_size, grid_pinpoints) + height, width = best_resolution + num_patches = 0 + # consider change to ceil(height/patch_size)*ceil(width/patch_size) + 1 + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + num_patches += 1 + # add the base patch + num_patches += 1 + return num_patches + + +# Copied from transformers.models.llava_next.modeling_llava_next.unpad_image +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (`torch.Tensor`): + The image tensor, assumed to be of shape (num_channels, height, width). + original_size (`tuple`): + The original size of the image (height, width). + + Returns: + `torch.Tensor`: The unpadded image tensor. 
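+
+ Example:
+ For an original 640x480 image (width x height) embedded in a square 336x336 feature map, the width
+ axis was fully used, so `scale_factor = 336 / 640`, the effective content height is
+ `round(480 * 336 / 640) = 252`, and 42 rows of padding are sliced off the top and bottom, leaving a
+ `(num_channels, 252, 336)` tensor.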
+ """ + if not isinstance(original_size, (list, tuple)): + if not isinstance(original_size, (torch.Tensor, np.ndarray)): + raise TypeError( + f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor" + ) + original_size = original_size.tolist() + original_height, original_width = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(round(original_height * scale_factor, 7)) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding : current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(round(original_width * scale_factor, 7)) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding : current_width - padding] + + return unpadded_tensor + + +@dataclass +# Copied from transformers.models.llava_next_video.modeling_llava_next_video.LlavaNextVideoCausalLMOutputWithPast with LlavaNextVideo->LlavaOnevision +class LlavaOnevisionCausalLMOutputWithPast(ModelOutput): + """ + Base class for LlavaOnevision causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`. 
+ video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + video_hidden_states: Optional[torch.FloatTensor] = None + + +# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaOnevision +class LlavaOnevisionMultiModalProjector(nn.Module): + def __init__(self, config: LlavaOnevisionConfig): + super().__init__() + self.linear_1 = nn.Linear( + config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + ) + + def forward(self, image_features): + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +LLAVA_ONEVISION_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlavaNextConfig`] or [`LlavaNextVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare LLaVA-Onevision Model outputting raw hidden-states without any specific head on top.", + LLAVA_ONEVISION_START_DOCSTRING, +) +class LlavaOnevisionPreTrainedModel(PreTrainedModel): + config_class = LlavaOnevisionConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlavaOnevisionVisionAttention"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_cache_class = True + _supports_static_cache = False # Qwen2 doesn't but llava has no reasons to not support + _supports_quantized_cache = True + _supports_sdpa = True + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextPreTrainedModel._init_weights + def _init_weights(self, module): + # important: this ported version of LlavaNext isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/llava_next should serve for that purpose + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if hasattr(module, "class_embedding"): + module.class_embedding.data.normal_(mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +LLAVA_ONEVISION_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`LlavaNextImageProcessor.__call__`] for details. [`LlavaProcessor`] uses + [`LlavaNextImageProcessor`] for processing images. + image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`, *optional*): + The sizes of the images in the batch, being (height, width) for each image. + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)): + The tensors corresponding to the input videos. Pixel values can be obtained using + [`LlavaNextVideoProcessor`]. See [`LlavaNextVideoProcessor.__call__`] for details. [`LlavaProcessor`] uses + [`LlavaNextVideoProcessor`] for processing videos. + image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): + The sizes of the videos in the batch, being (height, width) for each frame in the video. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + vision_feature_layer (`int`, *optional*, defaults to -2): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`): + Aspect ratio used when processong image features. The default value is "anyres_max_9". + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + """The LLaVA-Onevision model which consists of a vision backbone and a language model.""", + LLAVA_ONEVISION_START_DOCSTRING, +) +class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin): + def __init__(self, config: LlavaOnevisionConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) + embed_std = 1 / math.sqrt(config.text_config.hidden_size) + self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std) + + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.post_init() + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_input_embeddings + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_output_embeddings + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_decoder + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_decoder + def get_decoder(self): + return self.language_model.get_decoder() + + # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.tie_weights + def tie_weights(self): + return self.language_model.tie_weights() + + def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"): + """ + Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors. + + Args: + image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`) + List of image feature tensor, each contains all the visual feature of all patches. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + image_newline (`torch.Tensor` of shape `(embed_dim)`) + New line embedding vector. + vision_aspect_ratio (`str`, *optional*, "anyres_max_9"): + Aspect ratio used when processong image features. The default value is "anyres_max_9". 
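+ The trailing integer caps the retained patch area: when the unpadded feature map covers more than
+ roughly `max_num_patches * (image_size // patch_size) ** 2` positions, it is bilinearly downscaled
+ before the newline embeddings are appended.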
+ Returns: + image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`) + feature_lens (`List[int]`) + token length of each image in image_features + """ + new_image_features = [] + feature_lens = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size + if height * width != base_image_feature.shape[0]: + raise ValueError("The number of patches is not consistent with the image size.") + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx], + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + max_num_patches = int(vision_aspect_ratio.strip("anyres_max_")) + channels, curr_height, curr_width = image_feature.shape + ratio = math.sqrt(curr_height * curr_width / (max_num_patches * height**2)) + if ratio > 1.1: + image_feature = image_feature[None] + image_feature = nn.functional.interpolate( + image_feature, [int(curr_height // ratio), int(curr_width // ratio)], mode="bilinear" + )[0] + if image_newline is not None: + image_feature = torch.cat( + ( + image_feature, + image_newline[:, None, None] + .expand(*image_feature.shape[:-1], 1) + .to(image_feature.device, image_feature.dtype), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if image_newline is not None: + image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0) + new_image_features.append(image_feature) + feature_lens.append(image_feature.size(0)) + image_features = torch.cat(new_image_features, dim=0) + feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device) + return image_features, feature_lens + + def apply_pooling(self, image_features): + height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size + batch_frames, seq_len, dim = image_features.shape + image_features = image_features.view(batch_frames, height, width, -1) + image_features = image_features.permute(0, 3, 1, 2).contiguous() + + height, width = image_features.shape[2:] + scaled_shape = [math.ceil(height / 2), math.ceil(width / 2)] + image_features = nn.functional.interpolate(image_features, size=scaled_shape, mode="bilinear") + + image_features = image_features.permute(0, 2, 3, 1) + image_features = image_features.view(batch_frames, -1, dim) + return image_features + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: int, + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). 
+ vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + # ! infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + + def get_video_features( + self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str + ): + """ + Obtains video last hidden states from the vision tower, apply multimodal projection and pooling. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) + The tensors corresponding to the input video. + vision_feature_layer (`int`): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + video_features (List[`torch.Tensor`]): List of video feature tensor, each contains all the visual feature of all patches + and are of shape `(num_videos, video_length, embed_dim)`). 
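+
+ Note:
+ `apply_pooling` halves each spatial axis of the per-frame features, so with a 384-pixel, patch-14
+ SigLIP tower (as in the released checkpoints, 27x27 = 729 positions per frame) each frame contributes
+ 14x14 = 196 visual tokens; a single newline embedding per video is appended later in `forward`.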
+ """ + batch_size, frames, channels, height, width = pixel_values.shape + pixel_values = pixel_values.view(batch_size * frames, channels, height, width) + video_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_video_feature = video_features.hidden_states[vision_feature_layer] + + if vision_feature_select_strategy == "default": + selected_video_feature = selected_video_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_video_feature = selected_video_feature + video_features = self.multi_modal_projector(selected_video_feature) + + video_features = self.apply_pooling(video_features) + video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1) + + return video_features + + @add_start_docstrings(LLAVA_ONEVISION_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + image_sizes: Optional[torch.LongTensor] = None, + pixel_values_videos: torch.FloatTensor = None, + image_sizes_videos: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + vision_aspect_ratio: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, LlavaOnevisionCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + + Returns: + [`~LlavaOnevisionCausalLMOutputWithPast`] (if `return_dict=True`) or a `tuple`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> import torch + >>> from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration + + >>> model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype="float16", device_map="cuda:0") + >>> processor = LlavaOnevisionProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") + + >>> conversation = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "text", "text": "What is shown in this image?"}, + ... {"type": "image"}, + ... ], + ... }, + ... 
] + >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + >>> image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> raw_image = Image.open(requests.get(image_file, stream=True).raw) + >>> inputs = processor(text=prompt, images=raw_image, return_tensors='pt').to(0, torch.float16) + + >>> output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + >>> processor.batch_decode(output, skip_special_tokens=True)[0] + "user\n\nWhat is shown in this image?\nassistant\ncat" + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + vision_aspect_ratio = ( + vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: + raise ValueError( + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # Images are processed with Anyres + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_features, feature_lens = self.pack_image_features( + image_features, + image_sizes, + image_newline=self.image_newline, + vision_aspect_ratio=vision_aspect_ratio, + ) + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # Video are simply embedded and further pooled to decrease seq len + if pixel_values_videos is not None: + video_features = self.get_video_features( + pixel_values_videos, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_newline = ( + self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device) + ) + video_features = torch.cat((video_features, image_newline), dim=1) + video_features = video_features.flatten(0, 1) + + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() + n_video_features = video_features.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video 
features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) + special_video_mask = ( + (input_ids == self.config.video_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return LlavaOnevisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=video_features if pixel_values_videos is not None else None, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + pixel_values_videos=None, + image_sizes_videos=None, + attention_mask=None, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["image_sizes"] = image_sizes + model_inputs["pixel_values_videos"] = pixel_values_videos + model_inputs["image_sizes_videos"] = image_sizes_videos + + return model_inputs + + +__all__ = ["LlavaOnevisionForConditionalGeneration", "LlavaOnevisionPreTrainedModel"] diff --git a/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/processing_llava_onevision.py b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/processing_llava_onevision.py new file mode 
100644 index 0000000000000000000000000000000000000000..d42f287da25b22146bc4e7888776c262bea5c7a0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/models/llava_onevision/processing_llava_onevision.py @@ -0,0 +1,319 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LLaVa-Onevision.
+"""
+
+import math
+import os
+from typing import Iterable, List, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import select_best_resolution
+from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ..auto import AutoImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False):
+ # see processing_utils.ProcessingKwargs documentation for usage.
+ _defaults = {
+ "text_kwargs": {
+ "padding": False,
+ },
+ "image_kwargs": {},
+ "video_kwargs": {},
+ }
+
+
+class LlavaOnevisionProcessor(ProcessorMixin):
+ r"""
+ Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, LLaVa-Onevision image processor and a LLaMa tokenizer into a single processor.
+
+ [`LlavaOnevisionProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the
+ [`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
+
+ Args:
+ image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
+ The image processor is a required input.
+ tokenizer ([`LlamaTokenizerFast`], *optional*):
+ The tokenizer is a required input.
+ video_processor ([`LlavaOnevisionVideoProcessor`], *optional*):
+ The video processor is a required input.
+ num_image_tokens (`int`, *optional*):
+ Number of image tokens for one image that will be returned by the vision tower.
+ vision_feature_select_strategy (`str`, *optional*):
+ The feature selection strategy used to select the vision feature from the vision backbone.
+ Should be the same as in the model's config
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+ in a chat into a tokenizable string.
+ image_token (`str`, *optional*, defaults to `""`):
+ Special token used to denote image location.
+ video_token (`str`, *optional*, defaults to `"