diff --git a/venv/lib/python3.13/site-packages/transformers/commands/__init__.py b/venv/lib/python3.13/site-packages/transformers/commands/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa5d95a85b538171ec9cf4fa16e892df1efdef6b
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/commands/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from argparse import ArgumentParser
+
+
+class BaseTransformersCLICommand(ABC):
+    @staticmethod
+    @abstractmethod
+    def register_subcommand(parser: ArgumentParser):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def run(self):
+        raise NotImplementedError()
diff --git a/venv/lib/python3.13/site-packages/transformers/commands/add_fast_image_processor.py b/venv/lib/python3.13/site-packages/transformers/commands/add_fast_image_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..90c911525f7773a34ca9f97f180d5b127649f3c2
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/commands/add_fast_image_processor.py
@@ -0,0 +1,530 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from argparse import ArgumentParser, Namespace +from datetime import date +from pathlib import Path + +from ..utils import logging +from . import BaseTransformersCLICommand + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +CURRENT_YEAR = date.today().year +TRANSFORMERS_PATH = Path(__file__).parent.parent +REPO_PATH = TRANSFORMERS_PATH.parent.parent + + +def add_fast_image_processor_to_model_init( + fast_image_processing_module_file: str, fast_image_processor_name, model_name: str +): + """ + Add the fast image processor to the __init__.py file of the model. + """ + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "r", encoding="utf-8") as f: + content = f.read() + + fast_image_processing_module_file = fast_image_processing_module_file.split(os.sep)[-1].replace(".py", "") + + if "import *" in content: + # we have an init file in the updated format + # get the indented block after if TYPE_CHECKING: and before else:, append the new import, sort the imports and write the updated content + # Step 1: Find the block + block_regex = re.compile( + r"if TYPE_CHECKING:\n(?P.*?)(?=\s*else:)", + re.DOTALL, + ) + match = block_regex.search(content) + + if not match: + raise ValueError("Couldn't find the 'if TYPE_CHECKING' block.") + + block_content = match.group("if_block") # The captured import block + + # Step 2: Parse existing entries + entries = block_content.split("\n") + indent = " " * (len(entries[0]) - len(entries[0].lstrip())) + new_entry = f"{indent}from .{fast_image_processing_module_file} import *" + if new_entry not in entries: + entries.append(new_entry) + entries.sort() + updated_block = "\n".join(entry for entry in entries) + + # Replace the original block in the content + updated_content = content[: match.start("if_block")] + updated_block + content[match.end("if_block") :] + else: + # we have an init file in the old format + + # add "is_torchvision_available" import to from ...utils import ( + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+\.\.\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... 
b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + imports = (match.group(1) or match.group(2)).split(",") + imports = imports[:-1] if imports[-1] == "\n" else imports + imports = [imp.strip() for imp in imports] + + # Add the new import if not already present + if "is_torchvision_available" not in imports: + imports.append("is_torchvision_available") + imports.sort() + + # Convert to multi-line import in all cases + updated_imports = "(\n " + ",\n ".join(imports) + ",\n)" + + return f"from ...utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + vision_import_structure_block = f' _import_structure["{fast_image_processing_module_file[:-5]}"] = ["{fast_image_processor_name[:-4]}"]\n' + + added_import_structure_block = ( + "try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + "except OptionalDependencyNotAvailable:\n" + " pass\n" + "else:\n" + f' _import_structure["{fast_image_processing_module_file}"] = ["{fast_image_processor_name}"]\n' + ) + + if vision_import_structure_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_structure_block not in updated_content: + updated_content = updated_content.replace( + vision_import_structure_block, vision_import_structure_block + "\n" + added_import_structure_block + ) + + vision_import_statement_block = ( + f" from .{fast_image_processing_module_file[:-5]} import {fast_image_processor_name[:-4]}\n" + ) + + added_import_statement_block = ( + " try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + " except OptionalDependencyNotAvailable:\n" + " pass\n" + " else:\n" + f" from .{fast_image_processing_module_file} import {fast_image_processor_name}\n" + ) + + if vision_import_statement_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_statement_block not in updated_content: + updated_content = updated_content.replace( + vision_import_statement_block, vision_import_statement_block + "\n" + added_import_statement_block + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_auto(image_processor_name: str, fast_image_processor_name: str): + """ + Add the fast image processor to the auto module. + """ + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "r", encoding="utf-8") as f: + content = f.read() + + # get all lines containing the image processor name + updated_content = content.replace( + f'("{image_processor_name}",)', f'("{image_processor_name}", "{fast_image_processor_name}")' + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the model's doc file. 
+ """ + doc_source = REPO_PATH / "docs" / "source" + # find the doc files + doc_files = list(doc_source.glob(f"*/model_doc/{model_name}.md")) + if not doc_files: + # try again with "-" + doc_files = list(doc_source.glob(f"*/model_doc/{model_name.replace('_', '-')}.md")) + if not doc_files: + raise ValueError(f"No doc files found for {model_name}") + + base_doc_string = ( + f"## {fast_image_processor_name[:-4]}\n\n[[autodoc]] {fast_image_processor_name[:-4]}\n - preprocess" + ) + fast_doc_string = f"## {fast_image_processor_name}\n\n[[autodoc]] {fast_image_processor_name}\n - preprocess" + + for doc_file in doc_files: + with open(doc_file, "r", encoding="utf-8") as f: + content = f.read() + + if fast_doc_string not in content: + # add the fast image processor to the doc + updated_content = content.replace( + base_doc_string, + base_doc_string + "\n\n" + fast_doc_string, + ) + + # write the updated content + with open(doc_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the image processing tests. + """ + tests_path = REPO_PATH / "tests" / "models" / model_name + test_file = tests_path / f"test_image_processing_{model_name}.py" + if not os.path.exists(test_file): + logger.warning(f"No test file found for {model_name}. Skipping.") + return + + with open(test_file, "r", encoding="utf-8") as f: + content = f.read() + + # add is_torchvision_available import to the imports + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+transformers\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + existing_imports = (match.group(1) or match.group(2)).split(",") + existing_imports = existing_imports[:-1] if existing_imports[-1] == "\n" else existing_imports + existing_imports = [imp.strip() for imp in existing_imports] + + # Add the new import if not already present + if "is_torchvision_available" not in existing_imports: + existing_imports.append("is_torchvision_available") + existing_imports.sort() + + # Rebuild the import statement + if match.group(1): # Single-line import + updated_imports = ", ".join(existing_imports) + else: # Multi-line import + updated_imports = "(\n " + ",\n ".join(existing_imports) + ",\n)" + + return f"from transformers.utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + # add the fast image processor to the imports + base_import_string = f" from transformers import {fast_image_processor_name[:-4]}" + fast_import_string = ( + f" if is_torchvision_available():\n from transformers import {fast_image_processor_name}" + ) + if fast_import_string not in updated_content: + updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string) + + # get line starting with " image_processing_class = " and add a line after it starting with " fast_image_processing_class = " + image_processing_class_line = re.search(r" image_processing_class = .*", updated_content) + if not image_processing_class_line: + logger.warning(f"Couldn't find the 'image_processing_class' line in {test_file}. 
Skipping.") + return + + fast_image_processing_class_line = ( + f" fast_image_processing_class = {fast_image_processor_name} if is_torchvision_available() else None" + ) + if " fast_image_processing_class = " not in updated_content: + updated_content = updated_content.replace( + image_processing_class_line.group(0), + image_processing_class_line.group(0) + "\n" + fast_image_processing_class_line, + ) + + # write the updated content + with open(test_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def get_fast_image_processing_content_header(content: str) -> str: + """ + Get the header of the slow image processor file. + """ + # get all the commented lines at the beginning of the file + content_header = re.search(r"^# coding=utf-8\n(#[^\n]*\n)*", content, re.MULTILINE) + if not content_header: + logger.warning("Couldn't find the content header in the slow image processor file. Using a default header.") + return ( + f"# coding=utf-8\n" + f"# Copyright {CURRENT_YEAR} The HuggingFace Team. All rights reserved.\n" + f"#\n" + f'# Licensed under the Apache License, Version 2.0 (the "License");\n' + f"# you may not use this file except in compliance with the License.\n" + f"# You may obtain a copy of the License at\n" + f"#\n" + f"# http://www.apache.org/licenses/LICENSE-2.0\n" + f"#\n" + f"# Unless required by applicable law or agreed to in writing, software\n" + f'# distributed under the License is distributed on an "AS IS" BASIS,\n' + f"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + f"# See the License for the specific language governing permissions and\n" + f"# limitations under the License.\n" + f"\n" + ) + content_header = content_header.group(0) + # replace the year in the copyright + content_header = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content_header) + # get the line starting with """Image processor in content if it exists + match = re.search(r'^"""Image processor.*$', content, re.MULTILINE) + if match: + content_header += match.group(0).replace("Image processor", "Fast Image processor") + + return content_header + + +def write_default_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Write a default fast image processor file. Used when encountering a problem while parsing the slow image processor file. + """ + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n\n\n" + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # To be implemented\n" + " resample = None\n" + " image_mean = None\n" + " image_std = None\n" + " size = None\n" + " default_to_square = None\n" + " crop_size = None\n" + " do_resize = None\n" + " do_center_crop = None\n" + " do_rescale = None\n" + " do_normalize = None\n" + " do_convert_rgb = None\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + content = content_header + imports + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Add the fast image processor file to the model's folder. 
+ """ + # if the file already exists, do nothing + if os.path.exists(fast_image_processing_module_file): + print(f"{fast_image_processing_module_file} already exists. Skipping.") + return + + regex = rf"class {fast_image_processor_name[:-4]}.*?(\n\S|$)" + match = re.search(regex, content_base_file, re.DOTALL) + if not match: + print(f"Couldn't find the {fast_image_processor_name[:-4]} class in {fast_image_processing_module_file}") + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + # Exclude the last unindented line + slow_class_content = match.group(0).rstrip() + # get default args: + # find the __init__ block which start with def __init__ and ends with def + match = re.search(r"def __init__.*?def ", slow_class_content, re.DOTALL) + if not match: + print( + f"Couldn't find the __init__ block for {fast_image_processor_name[:-4]} in {fast_image_processing_module_file}" + ) + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + init = match.group(0) + init_signature_block = init.split(")")[0] + arg_names = init_signature_block.split(":") + arg_names = [arg_name.split("\n")[-1].strip() for arg_name in arg_names] + # get the default values + default_args = re.findall(r"= (.*?)(?:,|\))", init_signature_block) + + # build default args dict + default_args_dict = dict(zip(arg_names, default_args)) + pattern_default_size = r"size = size if size is not None else\s+(.*)" + match_default_size = re.findall(pattern_default_size, init) + default_args_dict["size"] = match_default_size[0] if match_default_size else None + pattern_default_crop_size = r"crop_size = crop_size if crop_size is not None else\s+(.*)" + match_default_crop_size = re.findall(pattern_default_crop_size, init) + default_args_dict["crop_size"] = match_default_crop_size[0] if match_default_crop_size else None + pattern_default_image_mean = r"self.image_mean = image_mean if image_mean is not None else\s+(.*)" + match_default_image_mean = re.findall(pattern_default_image_mean, init) + default_args_dict["image_mean"] = match_default_image_mean[0] if match_default_image_mean else None + pattern_default_image_std = r"self.image_std = image_std if image_std is not None else\s+(.*)" + match_default_image_std = re.findall(pattern_default_image_std, init) + default_args_dict["image_std"] = match_default_image_std[0] if match_default_image_std else None + default_args_dict["default_to_square"] = False if "(size, default_to_square=False" in init else None + + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"@auto_docstring\n" + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # This generated class can be used as a starting point for the fast image processor.\n" + " # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n" + " # only the default values should be set in the class.\n" + " # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden.\n" + " # In most cases, only the `_preprocess` method should be overridden.\n\n" + " # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n" + " # Default values 
should be checked against the slow image processor\n" + " # None values left after checking can be removed\n" + f" resample = {default_args_dict.get('resample')}\n" + f" image_mean = {default_args_dict.get('image_mean')}\n" + f" image_std = {default_args_dict.get('image_std')}\n" + f" size = {default_args_dict.get('size')}\n" + f" default_to_square = {default_args_dict.get('default_to_square')}\n" + f" crop_size = {default_args_dict.get('crop_size')}\n" + f" do_resize = {default_args_dict.get('do_resize')}\n" + f" do_center_crop = {default_args_dict.get('do_center_crop')}\n" + f" do_rescale = {default_args_dict.get('do_rescale')}\n" + f" do_normalize = {default_args_dict.get('do_normalize')}\n" + f" do_convert_rgb = {default_args_dict.get('do_convert_rgb')}\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n" + image_utils_imports = [] + if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"): + image_utils_imports.append("PILImageResampling") + if default_args_dict.get("image_mean") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_mean") + ): + image_utils_imports.append(default_args_dict.get("image_mean")) + if default_args_dict.get("image_std") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_std") + ): + image_utils_imports.append(default_args_dict.get("image_std")) + + if image_utils_imports: + # sort imports + image_utils_imports.sort() + imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n" + + imports += "from ...utils import auto_docstring\n" + + content = content_header + imports + "\n\n" + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor(model_name: str): + """ + Add the necessary references to the fast image processor in the transformers package, + and create the fast image processor file in the model's folder. 
+ """ + model_module = TRANSFORMERS_PATH / "models" / model_name + image_processing_module_file = list(model_module.glob("image_processing*.py")) + if not image_processing_module_file: + raise ValueError(f"No image processing module found in {model_module}") + elif len(image_processing_module_file) > 1: + for file_name in image_processing_module_file: + if not str(file_name).endswith("_fast.py"): + image_processing_module_file = str(file_name) + break + else: + image_processing_module_file = str(image_processing_module_file[0]) + + with open(image_processing_module_file, "r", encoding="utf-8") as f: + content_base_file = f.read() + + # regex to find object starting with "class " and ending with "ImageProcessor", including "ImageProcessor" in the match + image_processor_name = re.findall(r"class (\w*ImageProcessor)", content_base_file) + if not image_processor_name: + raise ValueError(f"No ImageProcessor class found in {image_processing_module_file}") + elif len(image_processor_name) > 1: + raise ValueError(f"Multiple ImageProcessor classes found in {image_processing_module_file}") + + image_processor_name = image_processor_name[0] + fast_image_processor_name = image_processor_name + "Fast" + fast_image_processing_module_file = image_processing_module_file.replace(".py", "_fast.py") + + print(f"Adding {fast_image_processor_name} to {fast_image_processing_module_file}") + + add_fast_image_processor_to_model_init( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_auto( + image_processor_name=image_processor_name, + fast_image_processor_name=fast_image_processor_name, + ) + + add_fast_image_processor_to_doc( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_tests( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_file( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + content_base_file=content_base_file, + ) + + +def add_new_model_like_command_factory(args: Namespace): + return AddFastImageProcessorCommand(model_name=args.model_name) + + +class AddFastImageProcessorCommand(BaseTransformersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + add_fast_image_processor_parser = parser.add_parser("add-fast-image-processor") + add_fast_image_processor_parser.add_argument( + "--model-name", + type=str, + required=True, + help="The name of the folder containing the model's implementation.", + ) + add_fast_image_processor_parser.set_defaults(func=add_new_model_like_command_factory) + + def __init__(self, model_name: str, *args): + self.model_name = model_name + + def run(self): + add_fast_image_processor(model_name=self.model_name) diff --git a/venv/lib/python3.13/site-packages/transformers/commands/add_new_model_like.py b/venv/lib/python3.13/site-packages/transformers/commands/add_new_model_like.py new file mode 100644 index 0000000000000000000000000000000000000000..fce524d4a6c0b219568229ea9b5ff387352a4485 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/commands/add_new_model_like.py @@ -0,0 +1,783 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import difflib
+import os
+import re
+import subprocess
+import textwrap
+from argparse import ArgumentParser, Namespace
+from datetime import date
+from pathlib import Path
+from typing import Any, Callable, Optional, Union
+
+from ..models.auto.configuration_auto import CONFIG_MAPPING_NAMES, MODEL_NAMES_MAPPING
+from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING_NAMES
+from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING_NAMES
+from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES
+from ..models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES
+from ..models.auto.video_processing_auto import VIDEO_PROCESSOR_MAPPING_NAMES
+from ..utils import is_libcst_available
+from . import BaseTransformersCLICommand
+from .add_fast_image_processor import add_fast_image_processor
+
+
+# We protect this import to avoid requiring it for all `transformers` CLI commands - however it is actually
+# strictly required for this one (we need it both for modular and for the following Visitor)
+if is_libcst_available():
+    import libcst as cst
+    from libcst import CSTVisitor
+    from libcst import matchers as m
+
+    class ClassFinder(CSTVisitor):
+        """
+        A visitor to find all classes in a python module.
+        """
+
+        def __init__(self):
+            self.classes: list = []
+            self.public_classes: list = []
+            self.is_in_class = False
+
+        def visit_ClassDef(self, node: cst.ClassDef) -> None:
+            """Record class names. We assume classes always only appear at top-level (i.e. no class definition in function or similar)"""
+            self.classes.append(node.name.value)
+            self.is_in_class = True
+
+        def leave_ClassDef(self, node: cst.ClassDef):
+            self.is_in_class = False
+
+        def visit_SimpleStatementLine(self, node: cst.SimpleStatementLine):
+            """Record all public classes inside the `__all__` assignment."""
+            simple_top_level_assign_structure = m.SimpleStatementLine(
+                body=[m.Assign(targets=[m.AssignTarget(target=m.Name())])]
+            )
+            if not self.is_in_class and m.matches(node, simple_top_level_assign_structure):
+                assigned_variable = node.body[0].targets[0].target.value
+                if assigned_variable == "__all__":
+                    elements = node.body[0].value.elements
+                    self.public_classes = [element.value.value for element in elements]
+
+
+CURRENT_YEAR = date.today().year
+TRANSFORMERS_PATH = Path(__file__).parent.parent
+REPO_PATH = TRANSFORMERS_PATH.parent.parent
+
+COPYRIGHT = f"""
+# coding=utf-8
+# Copyright {CURRENT_YEAR} the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""".lstrip() + + +class ModelInfos: + """ + Retrieve the basic information about an existing model classes. + """ + + def __init__(self, lowercase_name: str): + # Just to make sure it's indeed lowercase + self.lowercase_name = lowercase_name.lower().replace(" ", "_").replace("-", "_") + if self.lowercase_name not in CONFIG_MAPPING_NAMES: + self.lowercase_name.replace("_", "-") + if self.lowercase_name not in CONFIG_MAPPING_NAMES: + raise ValueError(f"{lowercase_name} is not a valid model name") + + self.paper_name = MODEL_NAMES_MAPPING[self.lowercase_name] + self.config_class = CONFIG_MAPPING_NAMES[self.lowercase_name] + self.camelcase_name = self.config_class.replace("Config", "") + + # Get tokenizer class + if self.lowercase_name in TOKENIZER_MAPPING_NAMES: + self.tokenizer_class, self.fast_tokenizer_class = TOKENIZER_MAPPING_NAMES[self.lowercase_name] + self.fast_tokenizer_class = ( + None if self.fast_tokenizer_class == "PreTrainedTokenizerFast" else self.fast_tokenizer_class + ) + else: + self.tokenizer_class, self.fast_tokenizer_class = None, None + + self.image_processor_class, self.fast_image_processor_class = IMAGE_PROCESSOR_MAPPING_NAMES.get( + self.lowercase_name, (None, None) + ) + self.video_processor_class = VIDEO_PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None) + self.feature_extractor_class = FEATURE_EXTRACTOR_MAPPING_NAMES.get(self.lowercase_name, None) + self.processor_class = PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None) + + +def add_content_to_file(file_name: Union[str, os.PathLike], new_content: str, add_after: str): + """ + A utility to add some content inside a given file. + + Args: + file_name (`str` or `os.PathLike`): + The name of the file in which we want to insert some content. + new_content (`str`): + The content to add. + add_after (`str`): + The new content is added just after the first instance matching it. + """ + with open(file_name, "r", encoding="utf-8") as f: + old_content = f.read() + + before, after = old_content.split(add_after, 1) + new_content = before + add_after + new_content + after + + with open(file_name, "w", encoding="utf-8") as f: + f.write(new_content) + + +def add_model_to_auto_mappings( + old_model_infos: ModelInfos, + new_lowercase_name: str, + new_model_paper_name: str, + filenames_to_add: list[tuple[str, bool]], +): + """ + Add a model to all the relevant mappings in the auto module. + + Args: + old_model_infos (`ModelInfos`): + The structure containing the class information of the old model. + new_lowercase_name (`str`): + The new lowercase model name. + new_model_paper_name (`str`): + The fully cased name (as in the official paper name) of the new model. + filenames_to_add (`list[tuple[str, bool]]`): + A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we + should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...] 
+ """ + new_cased_name = "".join(x.title() for x in new_lowercase_name.replace("-", "_").split("_")) + old_lowercase_name = old_model_infos.lowercase_name + old_cased_name = old_model_infos.camelcase_name + filenames_to_add = [ + (filename.replace(old_lowercase_name, "auto"), to_add) for filename, to_add in filenames_to_add[1:] + ] + # fast tokenizer/image processor have the same auto mappings as normal ones + corrected_filenames_to_add = [] + for file, to_add in filenames_to_add: + if re.search(r"(?:tokenization)|(?:image_processing)_auto_fast.py", file): + previous_file, previous_to_add = corrected_filenames_to_add[-1] + corrected_filenames_to_add[-1] = (previous_file, previous_to_add or to_add) + else: + corrected_filenames_to_add.append((file, to_add)) + + # Add the config mappings directly as the handling for config is a bit different + add_content_to_file( + TRANSFORMERS_PATH / "models" / "auto" / "configuration_auto.py", + new_content=f' ("{new_lowercase_name}", "{new_cased_name}Config"),\n', + add_after="CONFIG_MAPPING_NAMES = OrderedDict[str, str](\n [\n # Add configs here\n", + ) + add_content_to_file( + TRANSFORMERS_PATH / "models" / "auto" / "configuration_auto.py", + new_content=f' ("{new_lowercase_name}", "{new_model_paper_name}"),\n', + add_after="MODEL_NAMES_MAPPING = OrderedDict[str, str](\n [\n # Add full (and cased) model names here\n", + ) + + for filename, to_add in corrected_filenames_to_add: + if to_add: + # The auto mapping + filename = filename.replace("_fast.py", ".py") + with open(TRANSFORMERS_PATH / "models" / "auto" / filename) as f: + file = f.read() + # The regex has to be a bit complex like this as the tokenizer mapping has new lines everywhere + matching_lines = re.findall( + rf'( {{8,12}}\(\s*"{old_lowercase_name}",.*?\),\n)(?: {{4,12}}\(|\])', file, re.DOTALL + ) + for match in matching_lines: + add_content_to_file( + TRANSFORMERS_PATH / "models" / "auto" / filename, + new_content=match.replace(old_lowercase_name, new_lowercase_name).replace( + old_cased_name, new_cased_name + ), + add_after=match, + ) + + +def create_doc_file(new_paper_name: str, public_classes: list[str]): + """ + Create a new doc file to fill for the new model. + + Args: + new_paper_name (`str`): + The fully cased name (as in the official paper name) of the new model. + public_classes (`list[str]`): + A list of all the public classes that the model will have in the library. + """ + added_note = ( + "\n\n⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that " + "may not be rendered properly in your Markdown viewer.\n\n-->\n\n" + ) + copyright_for_markdown = re.sub(r"# ?", "", COPYRIGHT).replace("coding=utf-8\n", " [{"score":x, ...}, ...] + keys = ["score", "label", "box"] + annotation = [ + dict(zip(keys, vals)) + for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"]) + ] + + return annotation + + def _get_bounding_box(self, box: "torch.Tensor") -> dict[str, int]: + """ + Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... } + + Args: + box (`torch.Tensor`): Tensor containing the coordinates in corners format. + + Returns: + bbox (`dict[str, int]`): Dict containing the coordinates in corners format. 
+ """ + if self.framework != "pt": + raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.") + xmin, ymin, xmax, ymax = box.int().tolist() + bbox = { + "xmin": xmin, + "ymin": ymin, + "xmax": xmax, + "ymax": ymax, + } + return bbox diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/pt_utils.py b/venv/lib/python3.13/site-packages/transformers/pipelines/pt_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3857805962a93f796e407ed88c89495b46fd0bd0 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/pt_utils.py @@ -0,0 +1,323 @@ +import numpy as np +import torch +from torch.utils.data import Dataset, IterableDataset + +from ..utils.generic import ModelOutput + + +class PipelineDataset(Dataset): + def __init__(self, dataset, process, params): + self.dataset = dataset + self.process = process + self.params = params + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + item = self.dataset[i] + processed = self.process(item, **self.params) + return processed + + +class PipelineIterator(IterableDataset): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + ``` + for item in loader: + yield infer(item, **params) + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + loader_batch_size (`int`, *optional*): + If specified, the items of `loader` are supposed to come as batch, and are loader_batched here + making it roughly behave as + + + ``` + for items in loader: + for i in loader_batch_size: + item = items[i] + yield infer(item, **params) + ```""" + self.loader = loader + self.infer = infer + self.params = params + if loader_batch_size == 1: + # Let's spare some time by deactivating altogether + loader_batch_size = None + self.loader_batch_size = loader_batch_size + + # Internal bookkeeping + self._loader_batch_index = None + self._loader_batch_data = None + + def __len__(self): + return len(self.loader) + + def __iter__(self): + self.iterator = iter(self.loader) + return self + + def loader_batch_item(self): + """ + Return item located at `loader_batch_index` within the current `loader_batch_data`. + """ + if isinstance(self._loader_batch_data, torch.Tensor): + # Batch data is simple tensor, just fetch the slice + result = self._loader_batch_data[self._loader_batch_index].unsqueeze(0) + else: + # Batch data is assumed to be BaseModelOutput (or dict) + loader_batched = {} + for k, element in self._loader_batch_data.items(): + if isinstance(element, ModelOutput): + # Convert ModelOutput to tuple first + element = element.to_tuple() + if isinstance(element[0], torch.Tensor): + loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element) + elif isinstance(element[0], np.ndarray): + loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element) + continue + if k in {"hidden_states", "attentions"} and isinstance(element, tuple): + # Those are stored as lists of tensors so need specific unbatching. 
+ if isinstance(element[0], torch.Tensor): + loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element) + elif isinstance(element[0], np.ndarray): + loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element) + continue + if k == "past_key_values": + continue + if element is None: + # This can happen for optional data that get passed around + loader_batched[k] = None + elif isinstance(element[self._loader_batch_index], torch.Tensor): + # Take correct batch data, but make it looked like batch_size=1 + # For compatibility with other methods within transformers + + loader_batched[k] = element[self._loader_batch_index].unsqueeze(0) + elif isinstance(element[self._loader_batch_index], np.ndarray): + # Take correct batch data, but make it looked like batch_size=1 + # For compatibility with other methods within transformers + loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0) + else: + # This is typically a list, so no need to `unsqueeze`. + loader_batched[k] = element[self._loader_batch_index] + # Recreate the element by reusing the original class to make it look + # batch_size=1 + result = self._loader_batch_data.__class__(loader_batched) + self._loader_batch_index += 1 + return result + + def __next__(self): + if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size: + # We are currently unrolling a batch so we just need to return + # the current item within a batch + return self.loader_batch_item() + + # We're out of items within a batch + item = next(self.iterator) + processed = self.infer(item, **self.params) + # We now have a batch of "inferred things". + if self.loader_batch_size is not None: + # Try to infer the size of the batch + if isinstance(processed, torch.Tensor): + first_tensor = processed + elif isinstance(processed, tuple): + first_tensor = processed[0] + else: + key = list(processed.keys())[0] + first_tensor = processed[key] + + if isinstance(first_tensor, list): + observed_batch_size = len(first_tensor) + else: + observed_batch_size = first_tensor.shape[0] + if 0 < observed_batch_size < self.loader_batch_size: + # could be last batch so we can't unroll as many + # elements. + self.loader_batch_size = observed_batch_size + # Setting internal index to unwrap the batch + self._loader_batch_data = processed[0] if isinstance(processed, tuple) else processed + self._loader_batch_index = 0 + return self.loader_batch_item() + else: + # We're not unrolling batches + return processed + + +class PipelineChunkIterator(PipelineIterator): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + ``` + for iterator in loader: + for item in iterator: + yield infer(item, **params) + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + """ + super().__init__(loader, infer, params) + + def __iter__(self): + self.iterator = iter(self.loader) + self.subiterator = None + return self + + def __next__(self): + if self.subiterator is None: + "Subiterator None means we haven't started a `preprocess` iterator. 
so start it" + self.subiterator = self.infer(next(self.iterator), **self.params) + try: + # Try to return next item + processed = next(self.subiterator) + except StopIteration: + # When a preprocess iterator ends, we can start looking at the next item + # ChunkIterator will keep feeding until ALL elements of iterator + # all have created their subiterator and have been iterating against. + # + # Another way to look at it, is we're basically flattening lists of lists + # into a single list, but with generators + self.subiterator = self.infer(next(self.iterator), **self.params) + processed = next(self.subiterator) + return processed + + +class PipelinePackIterator(PipelineIterator): + """ + Roughly equivalent to + + ``` + packed = [] + for item in loader: + packed.append(item) + if item["is_last"]: + yield packed + packed = [] + ``` + + but it also handles cases where `item` are batched (meaning it's a dict of Tensor with first dimension > 1. In + that case it does + + ``` + packed = [] + for batch in loader: + # item is batched + for item in batch: + packed.append(item) + if item["is_last"]: + yield packed + packed = [] + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + loader_batch_size (`int`, *optional*): + If specified, the items of `loader` are supposed to come as batch, and are loader_batched here making + it roughly behave as + + + ``` + for items in loader: + for i in loader_batch_size: + item = items[i] + yield infer(item, **params) + ```""" + + def __iter__(self): + self.iterator = iter(self.loader) + return self + + def __next__(self): + # Extremely similar to PipelineIterator in its unpacking mechanism + # BUT, we have an extra required item which is the presence of `is_last` + # That is because everything is flattened by `PipelineChunkIterator` we + # need to keep track of how to regroup here in the original `process` + # boundaries so that `process` and `postprocess` see the same data. + + # This iterator accumulates items (possibly while unbatching) until it + # its a `is_last` and then just passes it on to the caller. + is_last = False + accumulator = [] + if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size: + while self._loader_batch_index < self.loader_batch_size: + item = self.loader_batch_item() + is_last = item.pop("is_last") + accumulator.append(item) + if is_last: + return accumulator + + while not is_last: + processed = self.infer(next(self.iterator), **self.params) + if self.loader_batch_size is not None: + if isinstance(processed, torch.Tensor): + first_tensor = processed + else: + key = list(processed.keys())[0] + first_tensor = processed[key] + if isinstance(first_tensor, list): + observed_batch_size = len(first_tensor) + else: + observed_batch_size = first_tensor.shape[0] + if 0 < observed_batch_size < self.loader_batch_size: + # could be last batch so we can't unroll as many + # elements. 
+ self.loader_batch_size = observed_batch_size + self._loader_batch_data = processed + self._loader_batch_index = 0 + while self._loader_batch_index < self.loader_batch_size: + item = self.loader_batch_item() + is_last = item.pop("is_last") + accumulator.append(item) + if is_last: + return accumulator + else: + item = processed + is_last = item.pop("is_last") + accumulator.append(item) + return accumulator + + +class KeyDataset(Dataset): + def __init__(self, dataset: Dataset, key: str): + self.dataset = dataset + self.key = key + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + return self.dataset[i][self.key] + + +class KeyPairDataset(Dataset): + def __init__(self, dataset: Dataset, key1: str, key2: str): + self.dataset = dataset + self.key1 = key1 + self.key2 = key2 + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + return {"text": self.dataset[i][self.key1], "text_pair": self.dataset[i][self.key2]} diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/question_answering.py b/venv/lib/python3.13/site-packages/transformers/pipelines/question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..1958fbd1fcc8420b1f45508a5fbe5e5c8949d4ac --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/question_answering.py @@ -0,0 +1,707 @@ +import inspect +import types +import warnings +from collections.abc import Iterable +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features +from ..modelcard import ModelCard +from ..tokenization_utils import PreTrainedTokenizer +from ..utils import ( + PaddingStrategy, + add_end_docstrings, + is_tf_available, + is_tokenizers_available, + is_torch_available, + logging, +) +from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + +if TYPE_CHECKING: + from ..modeling_tf_utils import TFPreTrainedModel + from ..modeling_utils import PreTrainedModel + + if is_tokenizers_available(): + import tokenizers + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + + Dataset = None + +if is_torch_available(): + import torch + from torch.utils.data import Dataset + + from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + + +def decode_spans( + start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray +) -> tuple: + """ + Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the actual + answer. + + In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or + answer end position being before the starting position. The method supports output the k-best answer through the + topk argument. + + Args: + start (`np.ndarray`): Individual start probabilities for each token. + end (`np.ndarray`): Individual end probabilities for each token. + topk (`int`): Indicates how many possible answer span(s) to extract from the model output. + max_answer_len (`int`): Maximum size of the answer to extract from the model's output. 
+ undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer + """ + # Ensure we have batch axis + if start.ndim == 1: + start = start[None] + + if end.ndim == 1: + end = end[None] + + # Compute the score of each tuple(start, end) to be the real answer + outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) + + # Remove candidate with end < start and end - start > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:] + desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero()) + starts = starts[desired_spans] + ends = ends[desired_spans] + scores = candidates[0, starts, ends] + + return starts, ends, scores + + +def select_starts_ends( + start, + end, + p_mask, + attention_mask, + min_null_score=1000000, + top_k=1, + handle_impossible_answer=False, + max_answer_len=15, +): + """ + Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses + `decode_spans()` to generate probabilities for each span to be the actual answer. + + Args: + start (`np.ndarray`): Individual start logits for each token. + end (`np.ndarray`): Individual end logits for each token. + p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer + attention_mask (`np.ndarray`): The attention mask generated by the tokenizer + min_null_score(`float`): The minimum null (empty) answer score seen so far. + topk (`int`): Indicates how many possible answer span(s) to extract from the model output. + handle_impossible_answer(`bool`): Whether to allow null (empty) answers + max_answer_len (`int`): Maximum size of the answer to extract from the model's output. + """ + # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. + undesired_tokens = np.abs(np.array(p_mask) - 1) + + if attention_mask is not None: + undesired_tokens = undesired_tokens & attention_mask + + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes in the tensor cannot contribute to the softmax + start = np.where(undesired_tokens_mask, -10000.0, start) + end = np.where(undesired_tokens_mask, -10000.0, end) + + # Normalize logits and spans to retrieve the answer + start = np.exp(start - start.max(axis=-1, keepdims=True)) + start = start / start.sum() + + end = np.exp(end - end.max(axis=-1, keepdims=True)) + end = end / end.sum() + + if handle_impossible_answer: + min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item()) + + # Mask CLS + start[0, 0] = end[0, 0] = 0.0 + + starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, undesired_tokens) + return starts, ends, scores, min_null_score + + +class QuestionAnsweringArgumentHandler(ArgumentHandler): + """ + QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to + internal [`SquadExample`]. + + QuestionAnsweringArgumentHandler manages all the possible to create a [`SquadExample`] from the command-line + supplied arguments. 
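A small NumPy sketch of `decode_spans`, assuming uniform start/end probabilities over four tokens and no masked positions.

```python
import numpy as np

from transformers.pipelines.question_answering import decode_spans

seq_len = 4
start = np.full((1, seq_len), 0.25)                   # per-token start probabilities
end = np.full((1, seq_len), 0.25)                     # per-token end probabilities
undesired_tokens = np.ones(seq_len, dtype=np.int64)   # 1 = token may belong to the answer

starts, ends, scores = decode_spans(
    start, end, topk=1, max_answer_len=2, undesired_tokens=undesired_tokens
)
print(starts, ends, scores)  # highest-scoring span with end >= start and length <= 2
```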
+ """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + def normalize(self, item): + if isinstance(item, SquadExample): + return item + elif isinstance(item, dict): + for k in ["question", "context"]: + if k not in item: + raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") + elif item[k] is None: + raise ValueError(f"`{k}` cannot be None") + elif isinstance(item[k], str) and len(item[k]) == 0: + raise ValueError(f"`{k}` cannot be empty") + + return QuestionAnsweringPipeline.create_sample(**item) + raise ValueError(f"{item} argument needs to be of type (SquadExample, dict)") + + def __call__(self, *args, **kwargs): + # Detect where the actual inputs are + if args is not None and len(args) > 0: + if len(args) == 1: + inputs = args[0] + elif len(args) == 2 and {type(el) for el in args} == {str}: + inputs = [{"question": args[0], "context": args[1]}] + else: + inputs = list(args) + # Generic compatibility with sklearn and Keras + # Batched data + elif "X" in kwargs: + warnings.warn( + "Passing the `X` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + inputs = kwargs["X"] + elif "data" in kwargs: + warnings.warn( + "Passing the `data` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + inputs = kwargs["data"] + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str): + inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]] + elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list): + if len(kwargs["question"]) != len(kwargs["context"]): + raise ValueError("Questions and contexts don't have the same lengths") + + inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])] + elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str): + inputs = [{"question": kwargs["question"], "context": kwargs["context"]}] + else: + raise ValueError("Arguments can't be understood") + else: + raise ValueError(f"Unknown arguments {kwargs}") + + # When user is sending a generator we need to trust it's a valid example + generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,) + if isinstance(inputs, generator_types): + return inputs + + # Normalize inputs + if isinstance(inputs, dict): + inputs = [inputs] + elif isinstance(inputs, Iterable): + # Copy to avoid overriding arguments + inputs = list(inputs) + else: + raise ValueError(f"Invalid arguments {kwargs}") + + for i, item in enumerate(inputs): + inputs[i] = self.normalize(item) + + return inputs + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class QuestionAnsweringPipeline(ChunkPipeline): + """ + Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering + examples](../task_summary#question-answering) for more information. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="deepset/roberta-base-squad2") + >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin") + {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=question-answering). + """ + + default_input_names = "question,context" + handle_impossible_answer = False + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + task: str = "", + **kwargs, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + task=task, + **kwargs, + ) + + self._args_parser = QuestionAnsweringArgumentHandler() + self.check_model_type( + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + ) + + @staticmethod + def create_sample( + question: Union[str, list[str]], context: Union[str, list[str]] + ) -> Union[SquadExample, list[SquadExample]]: + """ + QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulate all the + logic for converting question(s) and context(s) to [`SquadExample`]. + + We currently support extractive question answering. + + Arguments: + question (`str` or `list[str]`): The question(s) asked. + context (`str` or `list[str]`): The context(s) in which we will look for the answer. + + Returns: + One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context. 
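For instance, the helper can be exercised on its own, since it only wraps the inputs into `SquadExample` objects and needs no model.

```python
from transformers import QuestionAnsweringPipeline

single = QuestionAnsweringPipeline.create_sample(
    question="Where do I live?",
    context="My name is Wolfgang and I live in Berlin",
)
print(type(single).__name__)  # SquadExample

batch = QuestionAnsweringPipeline.create_sample(
    question=["Where do I live?", "What is my name?"],
    context=["My name is Wolfgang and I live in Berlin"] * 2,
)
print(len(batch))  # 2
```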
+ """ + if isinstance(question, list): + return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] + else: + return SquadExample(None, question, context, None, None, None) + + def _sanitize_parameters( + self, + padding=None, + topk=None, + top_k=None, + doc_stride=None, + max_answer_len=None, + max_seq_len=None, + max_question_len=None, + handle_impossible_answer=None, + align_to_words=None, + **kwargs, + ): + # Set defaults values + preprocess_params = {} + if padding is not None: + preprocess_params["padding"] = padding + if doc_stride is not None: + preprocess_params["doc_stride"] = doc_stride + if max_question_len is not None: + preprocess_params["max_question_len"] = max_question_len + if max_seq_len is not None: + preprocess_params["max_seq_len"] = max_seq_len + + postprocess_params = {} + if topk is not None and top_k is None: + warnings.warn("topk parameter is deprecated, use top_k instead", UserWarning) + top_k = topk + if top_k is not None: + if top_k < 1: + raise ValueError(f"top_k parameter should be >= 1 (got {top_k})") + postprocess_params["top_k"] = top_k + if max_answer_len is not None: + if max_answer_len < 1: + raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}") + postprocess_params["max_answer_len"] = max_answer_len + if handle_impossible_answer is not None: + postprocess_params["handle_impossible_answer"] = handle_impossible_answer + if align_to_words is not None: + postprocess_params["align_to_words"] = align_to_words + return preprocess_params, {}, postprocess_params + + def __call__(self, *args, **kwargs): + """ + Answer the question(s) given as inputs by using the context(s). + + Args: + question (`str` or `list[str]`): + One or several question(s) (must be used in conjunction with the `context` argument). + context (`str` or `list[str]`): + One or several context(s) associated with the question(s) (must be used in conjunction with the + `question` argument). + top_k (`int`, *optional*, defaults to 1): + The number of answers to return (will be chosen by order of likelihood). Note that we return less than + top_k answers if there are not enough options available within the context. + doc_stride (`int`, *optional*, defaults to 128): + If the context is too long to fit with the question for the model, it will be split in several chunks + with some overlap. This argument controls the size of that overlap. + max_answer_len (`int`, *optional*, defaults to 15): + The maximum length of predicted answers (e.g., only answers with a shorter length are considered). + max_seq_len (`int`, *optional*, defaults to 384): + The maximum length of the total sentence (context + question) in tokens of each chunk passed to the + model. The context will be split in several chunks (using `doc_stride` as overlap) if needed. + max_question_len (`int`, *optional*, defaults to 64): + The maximum length of the question after tokenization. It will be truncated if needed. + handle_impossible_answer (`bool`, *optional*, defaults to `False`): + Whether or not we accept impossible as an answer. + align_to_words (`bool`, *optional*, defaults to `True`): + Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on + non-space-separated languages (like Japanese or Chinese) + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **score** (`float`) -- The probability associated to the answer. 
+ - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input). + - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input). + - **answer** (`str`) -- The answer to the question. + """ + + # Convert inputs to features + if args: + warnings.warn( + "Passing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + + examples = self._args_parser(*args, **kwargs) + if isinstance(examples, (list, tuple)) and len(examples) == 1: + return super().__call__(examples[0], **kwargs) + return super().__call__(examples, **kwargs) + + def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None): + # XXX: This is special, args_parser will not handle anything generator or dataset like + # For those we expect user to send a simple valid example either directly as a SquadExample or simple dict. + # So we still need a little sanitation here. + if isinstance(example, dict): + example = SquadExample(None, example["question"], example["context"], None, None, None) + + if max_seq_len is None: + max_seq_len = min(self.tokenizer.model_max_length, 384) + if doc_stride is None: + doc_stride = min(max_seq_len // 2, 128) + + if doc_stride > max_seq_len: + raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})") + + if not self.tokenizer.is_fast: + features = squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=max_seq_len, + doc_stride=doc_stride, + max_query_length=max_question_len, + padding_strategy=PaddingStrategy.MAX_LENGTH, + is_training=False, + tqdm_enabled=False, + ) + else: + # Define the side we want to truncate / pad and the text/pair sorting + question_first = self.tokenizer.padding_side == "right" + + encoded_inputs = self.tokenizer( + text=example.question_text if question_first else example.context_text, + text_pair=example.context_text if question_first else example.question_text, + padding=padding, + truncation="only_second" if question_first else "only_first", + max_length=max_seq_len, + stride=doc_stride, + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + # When the input is too long, it's converted in a batch of inputs with overflowing tokens + # and a stride of overlap between the inputs. If a batch of inputs is given, a special output + # "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample. + # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping". + # "num_span" is the number of output samples generated from the overflowing tokens. 
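The overflow behaviour described in the comment above can be sketched with any fast tokenizer; the checkpoint below is the one from the pipeline's docstring example and is only illustrative.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

encoded_inputs = tokenizer(
    text="Where do I live?",
    text_pair="My name is Wolfgang and I live in Berlin. " * 50,  # deliberately long context
    truncation="only_second",
    max_length=64,
    stride=32,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    return_special_tokens_mask=True,
)
# One example is split into several overlapping spans, one entry per chunk.
print(len(encoded_inputs["input_ids"]))
```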
+ num_spans = len(encoded_inputs["input_ids"]) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) + p_mask = [ + [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans) + ] + + features = [] + for span_idx in range(num_spans): + input_ids_span_idx = encoded_inputs["input_ids"][span_idx] + attention_mask_span_idx = ( + encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None + ) + token_type_ids_span_idx = ( + encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None + ) + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) + if self.tokenizer.cls_token_id is not None: + cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0] + for cls_index in cls_indices: + p_mask[span_idx][cls_index] = 0 + submask = p_mask[span_idx] + features.append( + SquadFeatures( + input_ids=input_ids_span_idx, + attention_mask=attention_mask_span_idx, + token_type_ids=token_type_ids_span_idx, + p_mask=submask, + encoding=encoded_inputs[span_idx], + # We don't use the rest of the values - and actually + # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample + cls_index=None, + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + ) + + for i, feature in enumerate(features): + fw_args = {} + others = {} + model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"] + + for k, v in feature.__dict__.items(): + if k in model_input_names: + if self.framework == "tf": + tensor = tf.constant(v) + if tensor.dtype == tf.int64: + tensor = tf.cast(tensor, tf.int32) + fw_args[k] = tf.expand_dims(tensor, 0) + elif self.framework == "pt": + tensor = torch.tensor(v) + if tensor.dtype == torch.int32: + tensor = tensor.long() + fw_args[k] = tensor.unsqueeze(0) + else: + others[k] = v + + is_last = i == len(features) - 1 + yield {"example": example, "is_last": is_last, **fw_args, **others} + + def _forward(self, inputs): + example = inputs["example"] + model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names} + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters: + model_inputs["use_cache"] = False + output = self.model(**model_inputs) + if isinstance(output, dict): + return {"start": output["start_logits"], "end": output["end_logits"], "example": example, **inputs} + else: + start, end = output[:2] + return {"start": start, "end": end, "example": example, **inputs} + + def postprocess( + self, + model_outputs, + top_k=1, + handle_impossible_answer=False, + max_answer_len=15, + align_to_words=True, + ): + min_null_score = 1000000 # large and positive + answers = [] + for output in model_outputs: + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + start_ = output["start"].to(torch.float32) + end_ = output["end"].to(torch.float32) + else: + start_ = output["start"] + end_ = output["end"] + example = output["example"] + p_mask = output["p_mask"] + attention_mask = ( + 
output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None + ) + + pre_topk = ( + top_k * 2 + 10 if align_to_words else top_k + ) # Some candidates may be deleted if we align to words + starts, ends, scores, min_null_score = select_starts_ends( + start_, + end_, + p_mask, + attention_mask, + min_null_score, + pre_topk, + handle_impossible_answer, + max_answer_len, + ) + + if not self.tokenizer.is_fast: + char_to_word = np.array(example.char_to_word_offset) + + # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + for s, e, score in zip(starts, ends, scores): + token_to_orig_map = output["token_to_orig_map"] + answers.append( + { + "score": score.item(), + "start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(), + "answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]), + } + ) + else: + # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + question_first = self.tokenizer.padding_side == "right" + enc = output["encoding"] + + # Encoding was *not* padded, input_ids *might*. + # It doesn't make a difference unless we're padding on + # the left hand side, since now we have different offsets + # everywhere. + if self.tokenizer.padding_side == "left": + offset = (output["input_ids"] == self.tokenizer.pad_token_id).numpy().sum() + else: + offset = 0 + + # Sometimes the max probability token is in the middle of a word so: + # - we start by finding the right word containing the token with `token_to_word` + # - then we convert this word in a character span with `word_to_chars` + sequence_index = 1 if question_first else 0 + + for s, e, score in zip(starts, ends, scores): + s = s - offset + e = e - offset + + start_index, end_index = self.get_indices(enc, s, e, sequence_index, align_to_words) + + target_answer = example.context_text[start_index:end_index] + answer = self.get_answer(answers, target_answer) + + if answer: + answer["score"] += score.item() + else: + answers.append( + { + "score": score.item(), + "start": start_index, + "end": end_index, + "answer": example.context_text[start_index:end_index], + } + ) + + if handle_impossible_answer: + answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k] + if len(answers) == 1: + return answers[0] + return answers + + def get_answer(self, answers: list[dict], target: str) -> Optional[dict]: + for answer in answers: + if answer["answer"].lower() == target.lower(): + return answer + return None + + def get_indices( + self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool + ) -> tuple[int, int]: + if align_to_words: + try: + start_word = enc.token_to_word(s) + end_word = enc.token_to_word(e) + start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0] + end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1] + except Exception: + # Some tokenizers don't 
really handle words. Keep to offsets then. + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + else: + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + return start_index, end_index + + def span_to_answer(self, text: str, start: int, end: int) -> dict[str, Union[str, int]]: + """ + When decoding from token probabilities, this method maps token indexes to actual word in the initial context. + + Args: + text (`str`): The actual context to extract the answer from. + start (`int`): The answer starting token index. + end (`int`): The answer end token index. + + Returns: + Dictionary like `{'answer': str, 'start': int, 'end': int}` + """ + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 + + for word in text.split(" "): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + chars_idx += len(word) + 1 + + # Join text with spaces + return { + "answer": " ".join(words), + "start": max(0, char_start_idx), + "end": min(len(text), char_end_idx), + } diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/table_question_answering.py b/venv/lib/python3.13/site-packages/transformers/pipelines/table_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..4eba8ad64cf2e756f084a7726c89de1df3ce5cec --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/table_question_answering.py @@ -0,0 +1,457 @@ +import collections +import types + +import numpy as np + +from ..generation import GenerationConfig +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + requires_backends, +) +from .base import ArgumentHandler, Dataset, Pipeline, PipelineException, build_pipeline_init_args + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import ( + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES, + ) + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import ( + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES, + ) + + +class TableQuestionAnsweringArgumentHandler(ArgumentHandler): + """ + Handles arguments for the TableQuestionAnsweringPipeline + """ + + def __call__(self, table=None, query=None, **kwargs): + # Returns tqa_pipeline_inputs of shape: + # [ + # {"table": pd.DataFrame, "query": list[str]}, + # ..., + # {"table": pd.DataFrame, "query" : list[str]} + # ] + requires_backends(self, "pandas") + import pandas as pd + + if table is None: + raise ValueError("Keyword argument `table` cannot be None.") + elif query is None: + if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None: + tqa_pipeline_inputs = [table] + elif isinstance(table, list) and len(table) > 0: + if not all(isinstance(d, dict) for d in table): + raise ValueError( + f"Keyword argument `table` should be a list of dict, but is {(type(d) for d in table)}" + ) + + if table[0].get("query") is not None and table[0].get("table") is not None: + tqa_pipeline_inputs = table + else: + raise ValueError( + "If keyword argument `table` is a list 
of dictionaries, each dictionary should have a `table`" + f" and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys." + ) + elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType): + return table + else: + raise ValueError( + "Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but " + f"is {type(table)})" + ) + else: + tqa_pipeline_inputs = [{"table": table, "query": query}] + + for tqa_pipeline_input in tqa_pipeline_inputs: + if not isinstance(tqa_pipeline_input["table"], pd.DataFrame): + if tqa_pipeline_input["table"] is None: + raise ValueError("Table cannot be None.") + + tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"]) + + return tqa_pipeline_inputs + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TableQuestionAnsweringPipeline(Pipeline): + """ + Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in + PyTorch. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq") + >>> table = { + ... "Repository": ["Transformers", "Datasets", "Tokenizers"], + ... "Stars": ["36542", "4512", "3934"], + ... "Contributors": ["651", "77", "34"], + ... "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + ... } + >>> oracle(query="How many stars does the transformers repository have?", table=table) + {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"table-question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task. + See the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering). 
+ """ + + default_input_names = "table,query" + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), **kwargs): + super().__init__(**kwargs) + self._args_parser = args_parser + + if self.framework == "tf": + mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy() + mapping.update(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES) + else: + mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy() + mapping.update(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES) + self.check_model_type(mapping) + + self.aggregate = getattr(self.model.config, "aggregation_labels", None) and getattr( + self.model.config, "num_aggregation_labels", None + ) + self.type = "tapas" if hasattr(self.model.config, "aggregation_labels") else None + + def batch_inference(self, **inputs): + return self.model(**inputs) + + def sequential_inference(self, **inputs): + """ + Inference used for models that need to process sequences in a sequential fashion, like the SQA models which + handle conversational query related to a table. + """ + if self.framework == "pt": + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + token_type_ids = inputs["token_type_ids"].to(self.device) + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. 
+ if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device) + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=input_ids_example.unsqueeze(0), + attention_mask=attention_mask_example.unsqueeze(0), + token_type_ids=token_type_ids_example.unsqueeze(0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + dist_per_token = torch.distributions.Bernoulli(logits=logits) + probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to( + dist_per_token.probs.device + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(probabilities.squeeze().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = torch.cat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0)) + else: + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + token_type_ids = inputs["token_type_ids"].numpy() + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. 
+ if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = model_labels + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=np.expand_dims(input_ids_example, axis=0), + attention_mask=np.expand_dims(attention_mask_example, axis=0), + token_type_ids=np.expand_dims(token_type_ids_example, axis=0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + probabilities = tf.math.sigmoid(tf.cast(logits, tf.float32)) * tf.cast( + attention_mask_example, tf.float32 + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = tf.concat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0)) + + def __call__(self, *args, **kwargs): + r""" + Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below: + + - `pipeline(table, query)` + - `pipeline(table, [query])` + - `pipeline(table=table, query=query)` + - `pipeline(table=table, query=[query])` + - `pipeline({"table": table, "query": query})` + - `pipeline({"table": table, "query": [query]})` + - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])` + + The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table: + + Example: + + ```python + data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + ``` + + This dictionary can be passed in as such, or can be converted to a pandas DataFrame: + + Example: + + ```python + import pandas as pd + + table = pd.DataFrame.from_dict(data) + ``` + + Args: + table (`pd.DataFrame` or `Dict`): + Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values. + See above for an example of dictionary. + query (`str` or `list[str]`): + Query or list of queries that will be sent to the model alongside the table. + sequential (`bool`, *optional*, defaults to `False`): + Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the + inference to be done sequentially to extract relations within sequences, given their conversational + nature. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + + truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` + or to the maximum acceptable input length for the model if that argument is not provided. This will + truncate row by row, removing rows from the table. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + + + Return: + A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following + keys: + + - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will + be preceded by `AGGREGATOR >`. + - **coordinates** (`list[tuple[int, int]]`) -- Coordinates of the cells of the answers. + - **cells** (`list[str]`) -- List of strings made up of the answer cell values. + - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator. + """ + pipeline_inputs = self._args_parser(*args, **kwargs) + + results = super().__call__(pipeline_inputs, **kwargs) + if len(results) == 1: + return results[0] + return results + + def _sanitize_parameters(self, sequential=None, padding=None, truncation=None, **kwargs): + preprocess_params = {} + if padding is not None: + preprocess_params["padding"] = padding + if truncation is not None: + preprocess_params["truncation"] = truncation + + forward_params = {} + if sequential is not None: + forward_params["sequential"] = sequential + + if getattr(self, "assistant_model", None) is not None: + forward_params["assistant_model"] = self.assistant_model + if getattr(self, "assistant_tokenizer", None) is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, {} + + def preprocess(self, pipeline_input, padding=True, truncation=None): + if truncation is None: + if self.type == "tapas": + truncation = "drop_rows_to_fit" + else: + truncation = "do_not_truncate" + + table, query = pipeline_input["table"], pipeline_input["query"] + if table.empty: + raise ValueError("table is empty") + if query is None or query == "": + raise ValueError("query is empty") + inputs = self.tokenizer(table, query, return_tensors=self.framework, truncation=truncation, padding=padding) + inputs["table"] = table + return inputs + + def _forward(self, model_inputs, sequential=False, **generate_kwargs): + table = model_inputs.pop("table") + + if self.type == "tapas": + if sequential: + outputs = self.sequential_inference(**model_inputs) + else: + outputs = self.batch_inference(**model_inputs) + else: + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + 
generate_kwargs["generation_config"] = self.generation_config + + outputs = self.model.generate(**model_inputs, **generate_kwargs) + model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs} + return model_outputs + + def postprocess(self, model_outputs): + inputs = model_outputs["model_inputs"] + table = model_outputs["table"] + outputs = model_outputs["outputs"] + if self.type == "tapas": + if self.aggregate: + logits, logits_agg = outputs[:2] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg) + answer_coordinates_batch, agg_predictions = predictions + aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)} + + no_agg_label_index = self.model.config.no_aggregation_label_index + aggregators_prefix = { + i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index + } + else: + logits = outputs[0] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits) + answer_coordinates_batch = predictions[0] + aggregators = {} + aggregators_prefix = {} + answers = [] + for index, coordinates in enumerate(answer_coordinates_batch): + cells = [table.iat[coordinate] for coordinate in coordinates] + aggregator = aggregators.get(index, "") + aggregator_prefix = aggregators_prefix.get(index, "") + answer = { + "answer": aggregator_prefix + ", ".join(cells), + "coordinates": coordinates, + "cells": [table.iat[coordinate] for coordinate in coordinates], + } + if aggregator: + answer["aggregator"] = aggregator + + answers.append(answer) + if len(answer) == 0: + raise PipelineException("Table question answering", self.model.name_or_path, "Empty answer") + else: + answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)] + + return answers if len(answers) > 1 else answers[0] diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text2text_generation.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text2text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..8952b58208677ec1984b163ff2a34318ff6f8267 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text2text_generation.py @@ -0,0 +1,410 @@ +import enum +import warnings +from typing import Any, Union + +from ..generation import GenerationConfig +from ..tokenization_utils import TruncationStrategy +from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +class ReturnType(enum.Enum): + TENSORS = 0 + TEXT = 1 + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class Text2TextGenerationPipeline(Pipeline): + """ + Pipeline for text to text generation using seq2seq models. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - num_beams: 4 + + Example: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap") + >>> generator( + ... 
"answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google" + ... ) + [{'generated_text': 'question: Who created the RuPERTa-base?'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text + generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about + text generation parameters in [Text generation strategies](../generation_strategies) and [Text + generation](text_generation). + + This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"text2text-generation"`. + + The models that this pipeline can use are models that have been fine-tuned on a translation task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). For a list of available + parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Usage: + + ```python + text2text_generator = pipeline("text2text-generation") + text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything") + ```""" + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + # Make sure the docstring is updated when the default generation config is changed (in all pipelines in this file) + _default_generation_config = GenerationConfig( + max_new_tokens=256, + num_beams=4, + ) + + # Used in the return key of the pipeline. + return_name = "generated" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.check_model_type( + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + ) + + def _sanitize_parameters( + self, + return_tensors=None, + return_text=None, + return_type=None, + clean_up_tokenization_spaces=None, + truncation=None, + stop_sequence=None, + **generate_kwargs, + ): + preprocess_params = {} + if truncation is not None: + preprocess_params["truncation"] = truncation + + forward_params = generate_kwargs + + postprocess_params = {} + if return_tensors is not None and return_type is None: + return_type = ReturnType.TENSORS if return_tensors else ReturnType.TEXT + if return_type is not None: + postprocess_params["return_type"] = return_type + + if clean_up_tokenization_spaces is not None: + postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + + if stop_sequence is not None: + stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False) + if len(stop_sequence_ids) > 1: + warnings.warn( + "Stopping on a multiple token sequence is not yet supported on transformers. The first token of" + " the stop sequence will be used as the stop sequence string in the interim." 
+                )
+            generate_kwargs["eos_token_id"] = stop_sequence_ids[0]
+
+        if self.assistant_model is not None:
+            forward_params["assistant_model"] = self.assistant_model
+        if self.assistant_tokenizer is not None:
+            forward_params["tokenizer"] = self.tokenizer
+            forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+        return preprocess_params, forward_params, postprocess_params
+
+    def check_inputs(self, input_length: int, min_length: int, max_length: int):
+        """
+        Checks whether there might be something wrong with the given input with regard to the model.
+        """
+        return True
+
+    def _parse_and_tokenize(self, *args, truncation):
+        prefix = self.prefix if self.prefix is not None else ""
+        if isinstance(args[0], list):
+            if self.tokenizer.pad_token_id is None:
+                raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
+            args = ([prefix + arg for arg in args[0]],)
+            padding = True
+
+        elif isinstance(args[0], str):
+            args = (prefix + args[0],)
+            padding = False
+        else:
+            raise TypeError(
+                f"`args[0]`: {args[0]} has the wrong format. It should be either of type `str` or of type `list`"
+            )
+        inputs = self.tokenizer(*args, padding=padding, truncation=truncation, return_tensors=self.framework)
+        # This is produced by tokenizers but is an invalid `generate` kwarg
+        if "token_type_ids" in inputs:
+            del inputs["token_type_ids"]
+        return inputs
+
+    def __call__(self, *args: Union[str, list[str]], **kwargs: Any) -> list[dict[str, str]]:
+        r"""
+        Generate the output text(s) using text(s) given as inputs.
+
+        Args:
+            args (`str` or `list[str]`):
+                Input text for the encoder.
+            return_tensors (`bool`, *optional*, defaults to `False`):
+                Whether or not to include the tensors of predictions (as token indices) in the outputs.
+            return_text (`bool`, *optional*, defaults to `True`):
+                Whether or not to include the decoded texts in the outputs.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the potential extra spaces in the text output.
+            truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
+                The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE`
+                (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's
+                max_length instead of throwing an error down the line.
+            generate_kwargs:
+                Additional keyword arguments to pass along to the generate method of the model (see the generate method
+                corresponding to your framework [here](./text_generation)).
+
+        Return:
+            A list or a list of lists of `dict`: Each result comes as a dictionary with the following keys:
+
+            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+              ids of the generated text.
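+
+        Example (a minimal sketch: the checkpoint and prompts are illustrative choices, and the exact generated
+        text will vary):
+
+        ```python
+        from transformers import pipeline
+
+        # google-t5/t5-small is just one small seq2seq checkpoint; any text2text-generation model works here
+        generator = pipeline("text2text-generation", model="google-t5/t5-small")
+
+        # A single string returns a list with one dict containing "generated_text"
+        generator("translate English to French: How old are you?")
+
+        # A list of strings is treated as a batch and returns one dict per input
+        generator(
+            [
+                "translate English to German: The weather is nice today.",
+                "translate English to German: I like pizza.",
+            ]
+        )
+        ```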
+ """ + + result = super().__call__(*args, **kwargs) + if ( + isinstance(args[0], list) + and all(isinstance(el, str) for el in args[0]) + and all(len(res) == 1 for res in result) + ): + return [res[0] for res in result] + return result + + def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs): + inputs = self._parse_and_tokenize(inputs, truncation=truncation, **kwargs) + return inputs + + def _forward(self, model_inputs, **generate_kwargs): + if self.framework == "pt": + in_b, input_length = model_inputs["input_ids"].shape + elif self.framework == "tf": + in_b, input_length = tf.shape(model_inputs["input_ids"]).numpy() + + self.check_inputs( + input_length, + generate_kwargs.get("min_length", self.generation_config.min_length), + generate_kwargs.get("max_length", self.generation_config.max_length), + ) + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + output_ids = self.model.generate(**model_inputs, **generate_kwargs) + out_b = output_ids.shape[0] + if self.framework == "pt": + output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:]) + elif self.framework == "tf": + output_ids = tf.reshape(output_ids, (in_b, out_b // in_b, *output_ids.shape[1:])) + return {"output_ids": output_ids} + + def postprocess(self, model_outputs, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False): + records = [] + for output_ids in model_outputs["output_ids"][0]: + if return_type == ReturnType.TENSORS: + record = {f"{self.return_name}_token_ids": output_ids} + elif return_type == ReturnType.TEXT: + record = { + f"{self.return_name}_text": self.tokenizer.decode( + output_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + } + records.append(record) + return records + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class SummarizationPipeline(Text2TextGenerationPipeline): + """ + Summarize news articles and other documents. + + This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"summarization"`. + + The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is + currently, '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date + list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list + of available parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - num_beams: 4 + + Usage: + + ```python + # use bart in pytorch + summarizer = pipeline("summarization") + summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20) + + # use t5 in tf + summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf") + summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20) + ```""" + + # Used in the return key of the pipeline. 
+ return_name = "summary" + + def __call__(self, *args, **kwargs): + r""" + Summarize the text(s) given as inputs. + + Args: + documents (*str* or `list[str]`): + One or several articles (or one list of articles) to summarize. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input. + - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the summary. + """ + return super().__call__(*args, **kwargs) + + def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool: + """ + Checks whether there might be something wrong with given input with regard to the model. + """ + if max_length < min_length: + logger.warning(f"Your min_length={min_length} must be inferior than your max_length={max_length}.") + + if input_length < max_length: + logger.warning( + f"Your max_length is set to {max_length}, but your input_length is only {input_length}. Since this is " + "a summarization task, where outputs shorter than the input are typically wanted, you might " + f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length // 2})" + ) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TranslationPipeline(Text2TextGenerationPipeline): + """ + Translates from one language to another. + + This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"translation_xx_to_yy"`. + + The models that this pipeline can use are models that have been fine-tuned on a translation task. See the + up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation). + For a list of available parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - num_beams: 4 + + Usage: + + ```python + en_fr_translator = pipeline("translation_en_to_fr") + en_fr_translator("How old are you?") + ```""" + + # Used in the return key of the pipeline. + return_name = "translation" + + def check_inputs(self, input_length: int, min_length: int, max_length: int): + if input_length > 0.9 * max_length: + logger.warning( + f"Your input_length: {input_length} is bigger than 0.9 * max_length: {max_length}. You might consider " + "increasing your max_length manually, e.g. 
translator('...', max_length=400)" + ) + return True + + def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None): + if getattr(self.tokenizer, "_build_translation_inputs", None): + return self.tokenizer._build_translation_inputs( + *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang + ) + else: + return super()._parse_and_tokenize(*args, truncation=truncation) + + def _sanitize_parameters(self, src_lang=None, tgt_lang=None, **kwargs): + preprocess_params, forward_params, postprocess_params = super()._sanitize_parameters(**kwargs) + if src_lang is not None: + preprocess_params["src_lang"] = src_lang + if tgt_lang is not None: + preprocess_params["tgt_lang"] = tgt_lang + if src_lang is None and tgt_lang is None: + # Backward compatibility, direct arguments use is preferred. + task = kwargs.get("task", self.task) + items = task.split("_") + if task and len(items) == 4: + # translation, XX, to YY + preprocess_params["src_lang"] = items[1] + preprocess_params["tgt_lang"] = items[3] + return preprocess_params, forward_params, postprocess_params + + def __call__(self, *args, **kwargs): + r""" + Translate the text(s) given as inputs. + + Args: + args (`str` or `list[str]`): + Texts to be translated. + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. + src_lang (`str`, *optional*): + The language of the input. Might be required for multilingual models. Will not have any effect for + single pair translation models + tgt_lang (`str`, *optional*): + The language of the desired output. Might be required for multilingual models. Will not have any effect + for single pair translation models + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **translation_text** (`str`, present when `return_text=True`) -- The translation. + - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The + token ids of the translation. 
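+
+        Example (a minimal sketch; the checkpoints and language codes below are illustrative choices, not a
+        guaranteed or exhaustive list):
+
+        ```python
+        from transformers import pipeline
+
+        # Single-pair model: the languages are fixed by the checkpoint, so no src_lang/tgt_lang is needed
+        en_fr_translator = pipeline("translation_en_to_fr")
+        en_fr_translator("How old are you?")
+
+        # Multilingual model: src_lang and tgt_lang select the language pair
+        # (facebook/mbart-large-50-many-to-many-mmt with its "en_XX"/"fr_XX" codes is one possible choice)
+        translator = pipeline("translation", model="facebook/mbart-large-50-many-to-many-mmt")
+        translator("How old are you?", src_lang="en_XX", tgt_lang="fr_XX")
+        ```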
+ """ + return super().__call__(*args, **kwargs) diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..6f11f3bc97418be6106b183a1925245a273cb085 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text_classification.py @@ -0,0 +1,245 @@ +import inspect +import warnings +from typing import Any, Union + +import numpy as np + +from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available +from .base import GenericTensor, Pipeline, build_pipeline_init_args + + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + + +def sigmoid(_outputs): + return 1.0 / (1.0 + np.exp(-_outputs)) + + +def softmax(_outputs): + maxes = np.max(_outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(_outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class ClassificationFunction(ExplicitEnum): + SIGMOID = "sigmoid" + SOFTMAX = "softmax" + NONE = "none" + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True), + r""" + return_all_scores (`bool`, *optional*, defaults to `False`): + Whether to return all prediction scores or just the one of the predicted class. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different values: + + - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model + has several labels, will apply the softmax function on the output. In case of regression tasks, will not + apply any function on the output. + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output.""", +) +class TextClassificationPipeline(Pipeline): + """ + Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification + examples](../task_summary#sequence-classification) for more information. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") + >>> classifier("This movie is disgustingly good !") + [{'label': 'POSITIVE', 'score': 1.0}] + + >>> classifier("Director tried too much.") + [{'label': 'NEGATIVE', 'score': 0.996}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments). + + If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax + over the results. If there is a single label, the pipeline will run a sigmoid over the result. In case of regression + tasks (`model.config.problem_type == "regression"`), will not apply any function on the output. + + The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. 
See + the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=text-classification). + """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + return_all_scores = False + function_to_apply = ClassificationFunction.NONE + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.check_model_type( + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + ) + + def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs): + # Using "" as default argument because we're going to use `top_k=None` in user code to declare + # "No top_k" + preprocess_params = tokenizer_kwargs + + postprocess_params = {} + if hasattr(self.model.config, "return_all_scores") and return_all_scores is None: + return_all_scores = self.model.config.return_all_scores + + if isinstance(top_k, int) or top_k is None: + postprocess_params["top_k"] = top_k + postprocess_params["_legacy"] = False + elif return_all_scores is not None: + warnings.warn( + "`return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of" + " `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.", + UserWarning, + ) + if return_all_scores: + postprocess_params["top_k"] = None + else: + postprocess_params["top_k"] = 1 + + if isinstance(function_to_apply, str): + function_to_apply = ClassificationFunction[function_to_apply.upper()] + + if function_to_apply is not None: + postprocess_params["function_to_apply"] = function_to_apply + return preprocess_params, {}, postprocess_params + + def __call__( + self, + inputs: Union[str, list[str], dict[str, str], list[dict[str, str]]], + **kwargs: Any, + ) -> list[dict[str, Any]]: + """ + Classify the text(s) given as inputs. + + Args: + inputs (`str` or `list[str]` or `dict[str]`, or `list[dict[str]]`): + One or several texts to classify. In order to use text pairs for your classification, you can send a + dictionary containing `{"text", "text_pair"}` keys, or a list of those. + top_k (`int`, *optional*, defaults to `1`): + How many results to return. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different + values: + + If this argument is not specified, then it will apply the following functions according to the number + of labels: + + - If problem type is regression, will not apply any function on the output. + - If the model has a single label, will apply the sigmoid function on the output. + - If the model has several labels, will apply the softmax function on the output. + + Possible values are: + + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output. + + Return: + A list of `dict`: Each result comes as list of dictionaries with the following keys: + + - **label** (`str`) -- The label predicted. + - **score** (`float`) -- The corresponding probability. + + If `top_k` is used, one such dictionary is returned per label. + """ + inputs = (inputs,) + result = super().__call__(*inputs, **kwargs) + # TODO try and retrieve it in a nicer way from _sanitize_parameters. 
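+        # When the caller never passes `top_k`, we stay in the historical ("legacy") mode handled below: a plain
+        # string input gets wrapped in a one-element list. Passing `top_k` (even `top_k=None`) opts into the
+        # newer behaviour where the result is returned as-is.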
+        _legacy = "top_k" not in kwargs
+        if isinstance(inputs[0], str) and _legacy:
+            # This pipeline is odd and returns a list when a single item is run
+            return [result]
+        else:
+            return result
+
+    def preprocess(self, inputs, **tokenizer_kwargs) -> dict[str, GenericTensor]:
+        return_tensors = self.framework
+        if isinstance(inputs, dict):
+            return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+        elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
+            # It used to be valid to use a list of lists of lists for text pairs; keeping this path for BC
+            return self.tokenizer(
+                text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
+            )
+        elif isinstance(inputs, list):
+            # This is likely an invalid usage of the pipeline attempting to pass text pairs.
+            raise ValueError(
+                "The pipeline received invalid inputs. If you are trying to send text pairs, you can try to send a"
+                ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
+            )
+        return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+
+    def _forward(self, model_inputs):
+        # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
+        model_forward = self.model.forward if self.framework == "pt" else self.model.call
+        if "use_cache" in inspect.signature(model_forward).parameters:
+            model_inputs["use_cache"] = False
+        return self.model(**model_inputs)
+
+    def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
+        # `_legacy` determines whether we're running the naked pipeline in backward-compatibility mode, or
+        # whether `top_k` was passed explicitly (e.g. `pipeline(..., top_k=1)`), in which case we return the
+        # more natural result containing the list.
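+        # If no scoring function was passed explicitly, one is picked from the model config below: regression
+        # heads keep the raw logits, multi-label or single-logit heads get a sigmoid, multi-class heads get a
+        # softmax, and `config.function_to_apply` is used as a last resort.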
+ # Default value before `set_parameters` + if function_to_apply is None: + if self.model.config.problem_type == "regression": + function_to_apply = ClassificationFunction.NONE + elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1: + function_to_apply = ClassificationFunction.SIGMOID + elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1: + function_to_apply = ClassificationFunction.SOFTMAX + elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None: + function_to_apply = self.model.config.function_to_apply + else: + function_to_apply = ClassificationFunction.NONE + + outputs = model_outputs["logits"][0] + + if self.framework == "pt": + # To enable using fp16 and bf16 + outputs = outputs.float().numpy() + else: + outputs = outputs.numpy() + + if function_to_apply == ClassificationFunction.SIGMOID: + scores = sigmoid(outputs) + elif function_to_apply == ClassificationFunction.SOFTMAX: + scores = softmax(outputs) + elif function_to_apply == ClassificationFunction.NONE: + scores = outputs + else: + raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}") + + if top_k == 1 and _legacy: + return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()} + + dict_scores = [ + {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores) + ] + if not _legacy: + dict_scores.sort(key=lambda x: x["score"], reverse=True) + if top_k is not None: + dict_scores = dict_scores[:top_k] + return dict_scores diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text_generation.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..45ec58b702a21fc207a98e75f0b4a89e10e381a0 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text_generation.py @@ -0,0 +1,546 @@ +import enum +import itertools +import types +from typing import Any, overload + +from ..generation import GenerationConfig +from ..utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available +from .base import Pipeline, build_pipeline_init_args + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + from .pt_utils import KeyDataset + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + +ChatType = list[dict[str, str]] + + +class ReturnType(enum.Enum): + TENSORS = 0 + NEW_TEXT = 1 + FULL_TEXT = 2 + + +class Chat: + """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats + to this format because the rest of the pipeline code tends to assume that lists of messages are + actually a batch of samples rather than messages in the same conversation.""" + + def __init__(self, messages: dict): + for message in messages: + if not ("role" in message and "content" in message): + raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.") + self.messages = messages + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TextGenerationPipeline(Pipeline): + """ + Language generation pipeline using any `ModelWithLMHead` or `ModelForCausalLM`. This pipeline predicts the words + that will follow a specified text prompt. 
When the underlying model is a conversational model, it can also accept + one or more chats, in which case the pipeline will operate in chat mode and will continue the chat(s) by adding + its response(s). Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - do_sample: True + - temperature: 0.7 + + Examples: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="openai-community/gpt2") + >>> generator("I can't believe you did such a ", do_sample=False) + [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}] + + >>> # These parameters will return suggestions, and only the newly created text making it easier for prompting suggestions. + >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False) + ``` + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="HuggingFaceH4/zephyr-7b-beta") + >>> # Zephyr-beta is a conversational model, so let's pass it a chat instead of a single string + >>> generator([{"role": "user", "content": "What is the capital of France? Answer in one word."}], do_sample=False, max_new_tokens=2) + [{'generated_text': [{'role': 'user', 'content': 'What is the capital of France? Answer in one word.'}, {'role': 'assistant', 'content': 'Paris'}]}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text + generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about + text generation parameters in [Text generation strategies](../generation_strategies) and [Text + generation](text_generation). + + This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"text-generation"`. + + The models that this pipeline can use are models that have been trained with an autoregressive language modeling + objective. See the list of available [text completion models](https://huggingface.co/models?filter=text-generation) + and the list of [conversational models](https://huggingface.co/models?other=conversational) + on [huggingface.co/models]. + """ + + # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia + # in https://github.com/rusiaaman/XLNet-gen#methodology + # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e + + XL_PREFIX = """ + In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The + voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western + Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision + and denounces one of the men as a horse thief. Although his father initially slaps him for making such an + accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of + the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, + begging for his blessing. 
+ """ + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + do_sample=True, # free-form text generation often uses sampling + temperature=0.7, + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type( + TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ) + if "prefix" not in self._preprocess_params: + # This is very specific. The logic is quite complex and needs to be done + # as a "default". + # It also defines both some preprocess_kwargs and generate_kwargs + # which is why we cannot put them in their respective methods. + prefix = None + if self.prefix is not None: + prefix = self.prefix + if prefix is None and self.model.__class__.__name__ in [ + "XLNetLMHeadModel", + "TransfoXLLMHeadModel", + "TFXLNetLMHeadModel", + "TFTransfoXLLMHeadModel", + ]: + # For XLNet and TransformerXL we add an article to the prompt to give more state to the model. + prefix = self.XL_PREFIX + if prefix is not None: + # Recalculate some generate_kwargs linked to prefix. + preprocess_params, forward_params, _ = self._sanitize_parameters(prefix=prefix, **self._forward_params) + self._preprocess_params = {**self._preprocess_params, **preprocess_params} + self._forward_params = {**self._forward_params, **forward_params} + + def _sanitize_parameters( + self, + return_full_text=None, + return_tensors=None, + return_text=None, + return_type=None, + clean_up_tokenization_spaces=None, + prefix=None, + handle_long_generation=None, + stop_sequence=None, + truncation=None, + max_length=None, + continue_final_message=None, + skip_special_tokens=None, + tokenizer_encode_kwargs=None, + **generate_kwargs, + ): + # preprocess kwargs + preprocess_params = {} + add_special_tokens = False + if "add_special_tokens" in generate_kwargs: + add_special_tokens = preprocess_params["add_special_tokens"] = generate_kwargs.pop("add_special_tokens") + + if "padding" in generate_kwargs: + preprocess_params["padding"] = generate_kwargs.pop("padding") + + if truncation is not None: + preprocess_params["truncation"] = truncation + + if max_length is not None: + preprocess_params["max_length"] = max_length + generate_kwargs["max_length"] = max_length + + if prefix is not None: + preprocess_params["prefix"] = prefix + if prefix: + prefix_inputs = self.tokenizer( + prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework + ) + generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1] + + if handle_long_generation is not None: + if handle_long_generation != "hole": + raise ValueError( + f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected" + " [None, 'hole']" + ) + preprocess_params["handle_long_generation"] = handle_long_generation + + if continue_final_message is not None: + preprocess_params["continue_final_message"] = continue_final_message + + if tokenizer_encode_kwargs is not None: + preprocess_params["tokenizer_encode_kwargs"] = tokenizer_encode_kwargs + + preprocess_params.update(generate_kwargs) + + # forward kwargs + if stop_sequence is not None: + stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False) + generate_kwargs["eos_token_id"] = stop_sequence_ids + 
forward_params = generate_kwargs + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + # postprocess kwargs + postprocess_params = {} + if return_full_text is not None and return_type is None: + if return_text is not None: + raise ValueError("`return_text` is mutually exclusive with `return_full_text`") + if return_tensors is not None: + raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT + if return_tensors is not None and return_type is None: + if return_text is not None: + raise ValueError("`return_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.TENSORS + if return_type is not None: + postprocess_params["return_type"] = return_type + if clean_up_tokenization_spaces is not None: + postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + if continue_final_message is not None: + postprocess_params["continue_final_message"] = continue_final_message + if skip_special_tokens is not None: + postprocess_params["skip_special_tokens"] = skip_special_tokens + + return preprocess_params, forward_params, postprocess_params + + # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments + def _parse_and_tokenize(self, *args, **kwargs): + """ + Parse arguments and tokenize + """ + # Parse arguments + if self.model.__class__.__name__ == "TransfoXLLMHeadModel": + kwargs.update({"add_space_before_punct_symbol": True}) + + return super()._parse_and_tokenize(*args, **kwargs) + + @overload + def __call__(self, text_inputs: str, **kwargs: Any) -> list[dict[str, str]]: ... + + @overload + def __call__(self, text_inputs: list[str], **kwargs: Any) -> list[list[dict[str, str]]]: ... + + @overload + def __call__(self, text_inputs: ChatType, **kwargs: Any) -> list[dict[str, ChatType]]: ... + + @overload + def __call__(self, text_inputs: list[ChatType], **kwargs: Any) -> list[list[dict[str, ChatType]]]: ... + + def __call__(self, text_inputs, **kwargs): + """ + Complete the prompt(s) given as inputs. + + Args: + text_inputs (`str`, `list[str]`, list[dict[str, str]], or `list[list[dict[str, str]]]`): + One or several prompts (or one list of prompts) to complete. If strings or a list of string are + passed, this pipeline will continue each prompt. Alternatively, a "chat", in the form of a list + of dicts with "role" and "content" keys, can be passed, or a list of such chats. When chats are passed, + the model's chat template will be used to format them before passing them to the model. + return_tensors (`bool`, *optional*, defaults to `False`): + Returns the tensors of predictions (as token indices) in the outputs. If set to + `True`, the decoded text is not returned. + return_text (`bool`, *optional*): + Returns the decoded texts in the outputs. + return_full_text (`bool`, *optional*, defaults to `True`): + If set to `False` only added text is returned, otherwise the full text is returned. Cannot be + specified at the same time as `return_text`. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the potential extra spaces in the text output. 
+ continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the + last message in the input chat rather than starting a new one, allowing you to "prefill" its response. + By default this is `True` when the final message in the input chat has the `assistant` role and + `False` otherwise, but you can manually override that behaviour by setting this flag. + prefix (`str`, *optional*): + Prefix added to prompt. + handle_long_generation (`str`, *optional*): + By default, this pipelines does not handle long generation (ones that exceed in one form or the other + the model maximum length). There is no perfect way to address this (more info + :https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common + strategies to work around that problem depending on your use case. + + - `None` : default strategy where nothing in particular happens + - `"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might + truncate a lot of the prompt and not suitable when generation exceed the model capacity) + tokenizer_encode_kwargs (`dict`, *optional*): + Additional keyword arguments to pass along to the encoding step of the tokenizer. If the text input is + a chat, it is passed to `apply_chat_template`. Otherwise, it is passed to `__call__`. + generate_kwargs (`dict`, *optional*): + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of lists of `dict`: Returns one of the following dictionaries (cannot return a combination + of both `generated_text` and `generated_token_ids`): + + - **generated_text** (`str`, present when `return_text=True`) -- The generated text. + - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the generated text. + """ + if isinstance( + text_inputs, + (list, tuple, types.GeneratorType, KeyDataset) + if is_torch_available() + else (list, tuple, types.GeneratorType), + ): + if isinstance(text_inputs, types.GeneratorType): + text_inputs, _ = itertools.tee(text_inputs) + text_inputs, first_item = (x for x in text_inputs), next(_) + else: + first_item = text_inputs[0] + if isinstance(first_item, (list, tuple, dict)): + # We have one or more prompts in list-of-dicts format, so this is chat mode + if isinstance(first_item, dict): + return super().__call__(Chat(text_inputs), **kwargs) + else: + chats = (Chat(chat) for chat in text_inputs) # 🐈 🐈 🐈 + if isinstance(text_inputs, types.GeneratorType): + return super().__call__(chats, **kwargs) + else: + return super().__call__(list(chats), **kwargs) + return super().__call__(text_inputs, **kwargs) + + def preprocess( + self, + prompt_text, + prefix="", + handle_long_generation=None, + add_special_tokens=None, + truncation=None, + padding=None, + max_length=None, + continue_final_message=None, + tokenizer_encode_kwargs=None, + **generate_kwargs, + ): + # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults + tokenizer_kwargs = { + "add_special_tokens": add_special_tokens, + "truncation": truncation, + "padding": padding, + "max_length": max_length, # NOTE: `max_length` is also a `generate` arg. 
Use `tokenizer_encode_kwargs` to avoid a name clash + } + tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None} + tokenizer_kwargs.update(tokenizer_encode_kwargs or {}) + + if isinstance(prompt_text, Chat): + tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats + # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default + # because very few models support multiple separate, consecutive assistant messages + if continue_final_message is None: + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + inputs = self.tokenizer.apply_chat_template( + prompt_text.messages, + add_generation_prompt=not continue_final_message, + continue_final_message=continue_final_message, + return_dict=True, + return_tensors=self.framework, + **tokenizer_kwargs, + ) + else: + inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs) + + inputs["prompt_text"] = prompt_text + + if handle_long_generation == "hole": + cur_len = inputs["input_ids"].shape[-1] + if "max_new_tokens" in generate_kwargs: + new_tokens = generate_kwargs["max_new_tokens"] + else: + new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len + if new_tokens < 0: + raise ValueError("We cannot infer how many new tokens are expected") + if cur_len + new_tokens > self.tokenizer.model_max_length: + keep_length = self.tokenizer.model_max_length - new_tokens + if keep_length <= 0: + raise ValueError( + "We cannot use `hole` to handle this generation the number of desired tokens exceeds the" + " models max length" + ) + + inputs["input_ids"] = inputs["input_ids"][:, -keep_length:] + if "attention_mask" in inputs: + inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:] + + return inputs + + def _forward(self, model_inputs, **generate_kwargs): + input_ids = model_inputs["input_ids"] + attention_mask = model_inputs.get("attention_mask", None) + # Allow empty prompts + if input_ids.shape[1] == 0: + input_ids = None + attention_mask = None + in_b = 1 + else: + in_b = input_ids.shape[0] + prompt_text = model_inputs.pop("prompt_text") + + # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying + # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. 
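+        # The prefix tokens count towards `max_length`/`min_length`, so when the caller relies on those absolute
+        # limits (rather than `max_new_tokens`/`min_new_tokens`) they are stretched by the prefix length below,
+        # keeping the prefix from eating into the requested generation budget.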
+ prefix_length = generate_kwargs.pop("prefix_length", 0) + if prefix_length > 0: + has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].max_new_tokens is not None + ) + if not has_max_new_tokens: + generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length + generate_kwargs["max_length"] += prefix_length + has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].min_new_tokens is not None + ) + if not has_min_new_tokens and "min_length" in generate_kwargs: + generate_kwargs["min_length"] += prefix_length + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + output = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) + + if isinstance(output, ModelOutput): + generated_sequence = output.sequences + other_outputs = {k: v for k, v in output.items() if k not in {"sequences", "past_key_values"}} + out_b = generated_sequence.shape[0] + + if self.framework == "pt": + for key, value in other_outputs.items(): + if isinstance(value, torch.Tensor) and value.shape[0] == out_b: + other_outputs[key] = value.reshape(in_b, out_b // in_b, *value.shape[1:]) + if isinstance(value, tuple) and len(value[0]) == out_b: + value = torch.stack(value).swapaxes(0, 1) + other_outputs[key] = value + elif self.framework == "tf": + for key, value in other_outputs.items(): + if isinstance(value, tf.Tensor) and value.shape[0] == out_b: + other_outputs[key] = tf.reshape(value, (in_b, out_b // in_b, *value.shape[1:])) + if isinstance(value, tuple) and len(value[0]) == out_b: + value = tf.stack(value).swapaxes(0, 1) + other_outputs[key] = value + else: + generated_sequence = output + other_outputs = {} + + out_b = generated_sequence.shape[0] + if self.framework == "pt": + generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:]) + elif self.framework == "tf": + generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])) + + model_outputs = { + "generated_sequence": generated_sequence, + "input_ids": input_ids, + "prompt_text": prompt_text, + } + if other_outputs: + model_outputs.update({"additional_outputs": other_outputs}) + return model_outputs + + def postprocess( + self, + model_outputs, + return_type=ReturnType.FULL_TEXT, + clean_up_tokenization_spaces=True, + continue_final_message=None, + skip_special_tokens=None, + ): + generated_sequence = model_outputs["generated_sequence"][0] + input_ids = model_outputs["input_ids"] + prompt_text = model_outputs["prompt_text"] + generated_sequence = generated_sequence.numpy().tolist() + records = [] + other_outputs = model_outputs.get("additional_outputs", {}) + split_keys = {} + if other_outputs: + if self.framework == "pt": + for k, v in other_outputs.items(): + if isinstance(v, torch.Tensor) and v.shape[0] == len(generated_sequence): + split_keys[k] = v.numpy().tolist() + elif self.framework == "tf": + for k, v in other_outputs.items(): + if isinstance(v, tf.Tensor) and v.shape[0] == len(generated_sequence): + split_keys[k] = v.numpy().tolist() + + skip_special_tokens = skip_special_tokens if skip_special_tokens is not None else True + for idx, sequence in enumerate(generated_sequence): + 
if return_type == ReturnType.TENSORS: + record = {"generated_token_ids": sequence} + elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}: + # Decode text + text = self.tokenizer.decode( + sequence, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + + # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used + if input_ids is None: + prompt_length = 0 + else: + prompt_length = len( + self.tokenizer.decode( + input_ids[0], + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + ) + + all_text = text[prompt_length:] + if return_type == ReturnType.FULL_TEXT: + if isinstance(prompt_text, str): + all_text = prompt_text + all_text + elif isinstance(prompt_text, Chat): + if continue_final_message is None: + # If the user passes a chat ending in an assistant message, we treat it as a prefill by + # default because very few models support multiple separate, consecutive assistant messages + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + if continue_final_message: + # With assistant prefill, concat onto the end of the last message + all_text = list(prompt_text.messages)[:-1] + [ + { + "role": prompt_text.messages[-1]["role"], + "content": prompt_text.messages[-1]["content"] + all_text, + } + ] + else: + # When we're not starting from a prefill, the output is a new assistant message + all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}] + record = {"generated_text": all_text} + for key, values in split_keys.items(): + record[key] = values[idx] + records.append(record) + + return records diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..17eaba1466b34811e894064259e2045d330d6b9d --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.py @@ -0,0 +1,277 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.from typing import List, Union +from typing import Any, Union, overload + +from ..generation import GenerationConfig +from ..utils import is_torch_available +from .base import Pipeline + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING + from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan + +DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan" + + +class TextToAudioPipeline(Pipeline): + """ + Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This + pipeline generates an audio file from an input text and optional other conditional inputs. 
+ + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> pipe = pipeline(model="suno/bark-small") + >>> output = pipe("Hey it's HuggingFace on the phone!") + + >>> audio = output["audio"] + >>> sampling_rate = output["sampling_rate"] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + + + You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or + [`TextToAudioPipeline.__call__.generate_kwargs`]. + + Example: + + ```python + >>> from transformers import pipeline + + >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt") + + >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length + >>> generate_kwargs = { + ... "do_sample": True, + ... "temperature": 0.7, + ... "max_new_tokens": 35, + ... } + + >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs) + ``` + + + + This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or + `"text-to-audio"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech). + """ + + # Introducing the processor at load time for new behaviour + _load_processor = True + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, *args, vocoder=None, sampling_rate=None, no_processor=True, **kwargs): + super().__init__(*args, **kwargs) + + # Legacy behaviour just uses the tokenizer while new models use the processor as a whole at any given time + self.no_processor = no_processor + + if self.framework == "tf": + raise ValueError("The TextToAudioPipeline is only available in PyTorch.") + + self.vocoder = None + if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values(): + self.vocoder = ( + SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device) + if vocoder is None + else vocoder + ) + + self.sampling_rate = sampling_rate + if self.vocoder is not None: + self.sampling_rate = self.vocoder.config.sampling_rate + + if self.sampling_rate is None: + # get sampling_rate from config and generation config + + config = self.model.config + gen_config = self.model.__dict__.get("generation_config", None) + if gen_config is not None: + config.update(gen_config.to_dict()) + + for sampling_rate_name in ["sample_rate", "sampling_rate"]: + sampling_rate = getattr(config, sampling_rate_name, None) + if sampling_rate is not None: + self.sampling_rate = sampling_rate + elif getattr(config, "codec_config", None) is not None: + sampling_rate = getattr(config.codec_config, sampling_rate_name, None) + if sampling_rate is not None: + self.sampling_rate = sampling_rate + + # last fallback to get the sampling rate based on processor + if self.sampling_rate is None and not self.no_processor and hasattr(self.processor, "feature_extractor"): + self.sampling_rate = self.processor.feature_extractor.sampling_rate + 
+ def preprocess(self, text, **kwargs): + if isinstance(text, str): + text = [text] + + if self.model.config.model_type == "bark": + # bark Tokenizer is called with BarkProcessor which uses those kwargs + new_kwargs = { + "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256), + "add_special_tokens": False, + "return_attention_mask": True, + "return_token_type_ids": False, + "padding": "max_length", + } + + # priority is given to kwargs + new_kwargs.update(kwargs) + + kwargs = new_kwargs + + preprocessor = self.tokenizer if self.no_processor else self.processor + output = preprocessor(text, **kwargs, return_tensors="pt") + + return output + + def _forward(self, model_inputs, **kwargs): + # we expect some kwargs to be additional tensors which need to be on the right device + kwargs = self._ensure_tensor_on_device(kwargs, device=self.device) + forward_params = kwargs["forward_params"] + generate_kwargs = kwargs["generate_kwargs"] + + if self.model.can_generate(): + # we expect some kwargs to be additional tensors which need to be on the right device + generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device) + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + # generate_kwargs get priority over forward_params + forward_params.update(generate_kwargs) + + output = self.model.generate(**model_inputs, **forward_params) + else: + if len(generate_kwargs): + raise ValueError( + "You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non " + "empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. " + f"For reference, the `generate_kwargs` used here are: {generate_kwargs.keys()}" + ) + output = self.model(**model_inputs, **forward_params)[0] + + if self.vocoder is not None: + # in that case, the output is a spectrogram that needs to be converted into a waveform + output = self.vocoder(output) + + return output + + @overload + def __call__(self, text_inputs: str, **forward_params: Any) -> dict[str, Any]: ... + + @overload + def __call__(self, text_inputs: list[str], **forward_params: Any) -> list[dict[str, Any]]: ... + + def __call__( + self, text_inputs: Union[str, list[str]], **forward_params + ) -> Union[dict[str, Any], list[dict[str, Any]]]: + """ + Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information. + + Args: + text_inputs (`str` or `list[str]`): + The text(s) to generate. + forward_params (`dict`, *optional*): + Parameters passed to the model generation/forward method. `forward_params` are always passed to the + underlying model. + generate_kwargs (`dict`, *optional*): + The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a + complete overview of generate, check the [following + guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are + only passed to the underlying model if the latter is a generative model. + + Return: + A `dict` or a list of `dict`: The dictionaries have two keys: + + - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform. + - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform. 
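+
+ Example (an illustrative sketch rather than a doctest; it assumes `scipy` is installed and reuses the
+ `output` dictionary from the class-level example above, squeezing a mono waveform to one dimension):
+
+ ```python
+ import numpy as np
+ import scipy.io.wavfile
+
+ scipy.io.wavfile.write("speech.wav", rate=output["sampling_rate"], data=np.squeeze(output["audio"]))
+ ```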
+ """ + return super().__call__(text_inputs, **forward_params) + + def _sanitize_parameters( + self, + preprocess_params=None, + forward_params=None, + generate_kwargs=None, + ): + if getattr(self, "assistant_model", None) is not None: + generate_kwargs["assistant_model"] = self.assistant_model + if getattr(self, "assistant_tokenizer", None) is not None: + generate_kwargs["tokenizer"] = self.tokenizer + generate_kwargs["assistant_tokenizer"] = self.assistant_tokenizer + + params = { + "forward_params": forward_params if forward_params else {}, + "generate_kwargs": generate_kwargs if generate_kwargs else {}, + } + + if preprocess_params is None: + preprocess_params = {} + postprocess_params = {} + + return preprocess_params, params, postprocess_params + + def postprocess(self, audio): + output_dict = {} + + if self.model.config.model_type == "csm": + waveform_key = "audio" + else: + waveform_key = "waveform" + + # We directly get the waveform + if self.no_processor: + if isinstance(audio, dict): + waveform = audio[waveform_key] + elif isinstance(audio, tuple): + waveform = audio[0] + else: + waveform = audio + # Or we need to postprocess to get the waveform + else: + waveform = self.processor.decode(audio) + + if isinstance(audio, list): + output_dict["audio"] = [el.to(device="cpu", dtype=torch.float).numpy() for el in waveform] + else: + output_dict["audio"] = waveform.to(device="cpu", dtype=torch.float).numpy() + output_dict["sampling_rate"] = self.sampling_rate + + return output_dict diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/token_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..31ba1c481107885089709f9e4a52054d26c7aca8 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/token_classification.py @@ -0,0 +1,668 @@ +import types +import warnings +from typing import Any, Optional, Union, overload + +import numpy as np + +from ..models.bert.tokenization_bert import BasicTokenizer +from ..utils import ( + ExplicitEnum, + add_end_docstrings, + is_tf_available, + is_torch_available, +) +from .base import ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args + + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES + + +class TokenClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for token classification. 
+ """ + + def __call__(self, inputs: Union[str, list[str]], **kwargs): + is_split_into_words = kwargs.get("is_split_into_words", False) + delimiter = kwargs.get("delimiter") + + if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: + inputs = list(inputs) + batch_size = len(inputs) + elif isinstance(inputs, str): + inputs = [inputs] + batch_size = 1 + elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType): + return inputs, is_split_into_words, None, delimiter + else: + raise ValueError("At least one input is required.") + + offset_mapping = kwargs.get("offset_mapping") + if offset_mapping: + if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple): + offset_mapping = [offset_mapping] + if len(offset_mapping) != batch_size: + raise ValueError("offset_mapping should have the same batch size as the input") + return inputs, is_split_into_words, offset_mapping, delimiter + + +class AggregationStrategy(ExplicitEnum): + """All the valid aggregation strategies for TokenClassificationPipeline""" + + NONE = "none" + SIMPLE = "simple" + FIRST = "first" + AVERAGE = "average" + MAX = "max" + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True), + r""" + ignore_labels (`list[str]`, defaults to `["O"]`): + A list of labels to ignore. + grouped_entities (`bool`, *optional*, defaults to `False`): + DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the + same entity together in the predictions or not. + stride (`int`, *optional*): + If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size + model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The + value of this argument defines the number of overlapping tokens between chunks. In other words, the model + will shift forward by `tokenizer.model_max_length - stride` tokens each step. + aggregation_strategy (`str`, *optional*, defaults to `"none"`): + The strategy to fuse (or not) tokens based on the model prediction. + + - "none" : Will simply not do any aggregation and simply return raw results from the model + - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C, + I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D", + "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as + different entities. On word based languages, we might end up splitting words undesirably : Imagine + Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity": + "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages + that support that meaning, which is basically tokens separated by a space). These mitigations will + only work on real words, "New york" might still be tagged with two different entities. + - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot + end up with different tags. Words will simply use the tag of the first token of the word when there + is ambiguity. + - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words, + cannot end up with different tags. scores will be averaged first across tokens, and then the maximum + label is applied. 
+ - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot + end up with different tags. Word entity will simply be the token with the maximum score.""", +) +class TokenClassificationPipeline(ChunkPipeline): + """ + Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition + examples](../task_summary#named-entity-recognition) for more information. + + Example: + + ```python + >>> from transformers import pipeline + + >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple") + >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal" + >>> tokens = token_classifier(sentence) + >>> tokens + [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}] + + >>> token = tokens[0] + >>> # Start and end provide an easy way to highlight words in the original text. + >>> sentence[token["start"] : token["end"]] + ' jean-baptiste' + + >>> # Some models use the same idea to do part of speech. + >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple") + >>> syntaxer("My name is Sarah and I live in London") + [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous). + + The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=token-classification). 
+ """ + + default_input_names = "sequences" + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, args_parser=TokenClassificationArgumentHandler(), **kwargs): + super().__init__(**kwargs) + + self.check_model_type( + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES + ) + + self._basic_tokenizer = BasicTokenizer(do_lower_case=False) + self._args_parser = args_parser + + def _sanitize_parameters( + self, + ignore_labels=None, + grouped_entities: Optional[bool] = None, + ignore_subwords: Optional[bool] = None, + aggregation_strategy: Optional[AggregationStrategy] = None, + offset_mapping: Optional[list[tuple[int, int]]] = None, + is_split_into_words: bool = False, + stride: Optional[int] = None, + delimiter: Optional[str] = None, + ): + preprocess_params = {} + preprocess_params["is_split_into_words"] = is_split_into_words + + if is_split_into_words: + preprocess_params["delimiter"] = " " if delimiter is None else delimiter + + if offset_mapping is not None: + preprocess_params["offset_mapping"] = offset_mapping + + postprocess_params = {} + if grouped_entities is not None or ignore_subwords is not None: + if grouped_entities and ignore_subwords: + aggregation_strategy = AggregationStrategy.FIRST + elif grouped_entities and not ignore_subwords: + aggregation_strategy = AggregationStrategy.SIMPLE + else: + aggregation_strategy = AggregationStrategy.NONE + + if grouped_entities is not None: + warnings.warn( + "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to" + f' `aggregation_strategy="{aggregation_strategy}"` instead.' + ) + if ignore_subwords is not None: + warnings.warn( + "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to" + f' `aggregation_strategy="{aggregation_strategy}"` instead.' + ) + + if aggregation_strategy is not None: + if isinstance(aggregation_strategy, str): + aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()] + if ( + aggregation_strategy + in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE} + and not self.tokenizer.is_fast + ): + raise ValueError( + "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option" + ' to `"simple"` or use a fast tokenizer.' + ) + postprocess_params["aggregation_strategy"] = aggregation_strategy + if ignore_labels is not None: + postprocess_params["ignore_labels"] = ignore_labels + if stride is not None: + if stride >= self.tokenizer.model_max_length: + raise ValueError( + "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)" + ) + if aggregation_strategy == AggregationStrategy.NONE: + raise ValueError( + "`stride` was provided to process all the text but `aggregation_strategy=" + f'"{aggregation_strategy}"`, please select another one instead.' + ) + else: + if self.tokenizer.is_fast: + tokenizer_params = { + "return_overflowing_tokens": True, + "padding": True, + "stride": stride, + } + preprocess_params["tokenizer_params"] = tokenizer_params + else: + raise ValueError( + "`stride` was provided to process all the text but you're using a slow tokenizer." + " Please use a fast tokenizer." + ) + return preprocess_params, {}, postprocess_params + + @overload + def __call__(self, inputs: str, **kwargs: Any) -> list[dict[str, str]]: ... 
+ + @overload + def __call__(self, inputs: list[str], **kwargs: Any) -> list[list[dict[str, str]]]: ... + + def __call__( + self, inputs: Union[str, list[str]], **kwargs: Any + ) -> Union[list[dict[str, str]], list[list[dict[str, str]]]]: + """ + Classify each token of the text(s) given as inputs. + + Args: + inputs (`str` or `List[str]`): + One or several texts (or one list of texts) for token classification. Can be pre-tokenized when + `is_split_into_words=True`. + + Return: + A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the + corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with + the following keys: + + - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you + want to have the exact string in the original sentence, use `start` and `end`. + - **score** (`float`) -- The corresponding probability for `entity`. + - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when + *aggregation_strategy* is not `"none"`. + - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding + token in the sentence. + - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only + exists if the offsets are available within the tokenizer + - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only + exists if the offsets are available within the tokenizer + """ + + _inputs, is_split_into_words, offset_mapping, delimiter = self._args_parser(inputs, **kwargs) + kwargs["is_split_into_words"] = is_split_into_words + kwargs["delimiter"] = delimiter + if is_split_into_words and not all(isinstance(input, list) for input in inputs): + return super().__call__([inputs], **kwargs) + if offset_mapping: + kwargs["offset_mapping"] = offset_mapping + + return super().__call__(inputs, **kwargs) + + def preprocess(self, sentence, offset_mapping=None, **preprocess_params): + tokenizer_params = preprocess_params.pop("tokenizer_params", {}) + truncation = self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 + + word_to_chars_map = None + is_split_into_words = preprocess_params["is_split_into_words"] + if is_split_into_words: + delimiter = preprocess_params["delimiter"] + if not isinstance(sentence, list): + raise ValueError("When `is_split_into_words=True`, `sentence` must be a list of tokens.") + words = sentence + sentence = delimiter.join(words) # Recreate the sentence string for later display and slicing + # This map will allows to convert back word => char indices + word_to_chars_map = [] + delimiter_len = len(delimiter) + char_offset = 0 + for word in words: + word_to_chars_map.append((char_offset, char_offset + len(word))) + char_offset += len(word) + delimiter_len + + # We use `words` as the actual input for the tokenizer + text_to_tokenize = words + tokenizer_params["is_split_into_words"] = True + else: + if not isinstance(sentence, str): + raise ValueError("When `is_split_into_words=False`, `sentence` must be an untokenized string.") + text_to_tokenize = sentence + + inputs = self.tokenizer( + text_to_tokenize, + return_tensors=self.framework, + truncation=truncation, + return_special_tokens_mask=True, + return_offsets_mapping=self.tokenizer.is_fast, + **tokenizer_params, + ) + + if is_split_into_words and not self.tokenizer.is_fast: + raise 
ValueError("is_split_into_words=True is only supported with fast tokenizers.") + + inputs.pop("overflow_to_sample_mapping", None) + num_chunks = len(inputs["input_ids"]) + + for i in range(num_chunks): + if self.framework == "tf": + model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()} + else: + model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()} + if offset_mapping is not None: + model_inputs["offset_mapping"] = offset_mapping + + model_inputs["sentence"] = sentence if i == 0 else None + model_inputs["is_last"] = i == num_chunks - 1 + if word_to_chars_map is not None: + model_inputs["word_ids"] = inputs.word_ids(i) + model_inputs["word_to_chars_map"] = word_to_chars_map + + yield model_inputs + + def _forward(self, model_inputs): + # Forward + special_tokens_mask = model_inputs.pop("special_tokens_mask") + offset_mapping = model_inputs.pop("offset_mapping", None) + sentence = model_inputs.pop("sentence") + is_last = model_inputs.pop("is_last") + word_ids = model_inputs.pop("word_ids", None) + word_to_chars_map = model_inputs.pop("word_to_chars_map", None) + + if self.framework == "tf": + logits = self.model(**model_inputs)[0] + else: + output = self.model(**model_inputs) + logits = output["logits"] if isinstance(output, dict) else output[0] + + return { + "logits": logits, + "special_tokens_mask": special_tokens_mask, + "offset_mapping": offset_mapping, + "sentence": sentence, + "is_last": is_last, + "word_ids": word_ids, + "word_to_chars_map": word_to_chars_map, + **model_inputs, + } + + def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None): + if ignore_labels is None: + ignore_labels = ["O"] + all_entities = [] + + # Get map from the first output, it's the same for all chunks + word_to_chars_map = all_outputs[0].get("word_to_chars_map") + + for model_outputs in all_outputs: + if self.framework == "pt" and model_outputs["logits"][0].dtype in (torch.bfloat16, torch.float16): + logits = model_outputs["logits"][0].to(torch.float32).numpy() + else: + logits = model_outputs["logits"][0].numpy() + + sentence = all_outputs[0]["sentence"] + input_ids = model_outputs["input_ids"][0] + offset_mapping = ( + model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None + ) + special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy() + word_ids = model_outputs.get("word_ids") + + maxes = np.max(logits, axis=-1, keepdims=True) + shifted_exp = np.exp(logits - maxes) + scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + if self.framework == "tf": + input_ids = input_ids.numpy() + offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None + + pre_entities = self.gather_pre_entities( + sentence, + input_ids, + scores, + offset_mapping, + special_tokens_mask, + aggregation_strategy, + word_ids=word_ids, + word_to_chars_map=word_to_chars_map, + ) + grouped_entities = self.aggregate(pre_entities, aggregation_strategy) + # Filter anything that is in self.ignore_labels + entities = [ + entity + for entity in grouped_entities + if entity.get("entity", None) not in ignore_labels + and entity.get("entity_group", None) not in ignore_labels + ] + all_entities.extend(entities) + num_chunks = len(all_outputs) + if num_chunks > 1: + all_entities = self.aggregate_overlapping_entities(all_entities) + return all_entities + + def aggregate_overlapping_entities(self, entities): + if len(entities) == 0: + return entities + entities = sorted(entities, key=lambda x: 
x["start"]) + aggregated_entities = [] + previous_entity = entities[0] + for entity in entities: + if previous_entity["start"] <= entity["start"] < previous_entity["end"]: + current_length = entity["end"] - entity["start"] + previous_length = previous_entity["end"] - previous_entity["start"] + if ( + current_length > previous_length + or current_length == previous_length + and entity["score"] > previous_entity["score"] + ): + previous_entity = entity + else: + aggregated_entities.append(previous_entity) + previous_entity = entity + aggregated_entities.append(previous_entity) + return aggregated_entities + + def gather_pre_entities( + self, + sentence: str, + input_ids: np.ndarray, + scores: np.ndarray, + offset_mapping: Optional[list[tuple[int, int]]], + special_tokens_mask: np.ndarray, + aggregation_strategy: AggregationStrategy, + word_ids: Optional[list[Optional[int]]] = None, + word_to_chars_map: Optional[list[tuple[int, int]]] = None, + ) -> list[dict]: + """Fuse various numpy arrays into dicts with all the information needed for aggregation""" + pre_entities = [] + for idx, token_scores in enumerate(scores): + # Filter special_tokens + if special_tokens_mask[idx]: + continue + + word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) + if offset_mapping is not None: + start_ind, end_ind = offset_mapping[idx] + + # If the input is pre-tokenized, we need to rescale the offsets to the absolute sentence. + if word_ids is not None and word_to_chars_map is not None: + word_index = word_ids[idx] + if word_index is not None: + start_char, _ = word_to_chars_map[word_index] + start_ind += start_char + end_ind += start_char + + if not isinstance(start_ind, int): + if self.framework == "pt": + start_ind = start_ind.item() + end_ind = end_ind.item() + word_ref = sentence[start_ind:end_ind] + if getattr(self.tokenizer, "_tokenizer", None) and getattr( + self.tokenizer._tokenizer.model, "continuing_subword_prefix", None + ): + # This is a BPE, word aware tokenizer, there is a correct way + # to fuse tokens + is_subword = len(word) != len(word_ref) + else: + # This is a fallback heuristic. This will fail most likely on any kind of text + punctuation mixtures that will be considered "words". Non word aware models cannot do better than this unfortunately. 
+ if aggregation_strategy in { + AggregationStrategy.FIRST, + AggregationStrategy.AVERAGE, + AggregationStrategy.MAX, + }: + warnings.warn( + "Tokenizer does not support real words, using fallback heuristic", + UserWarning, + ) + is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1] + + if int(input_ids[idx]) == self.tokenizer.unk_token_id: + word = word_ref + is_subword = False + else: + start_ind = None + end_ind = None + is_subword = False + + pre_entity = { + "word": word, + "scores": token_scores, + "start": start_ind, + "end": end_ind, + "index": idx, + "is_subword": is_subword, + } + pre_entities.append(pre_entity) + return pre_entities + + def aggregate(self, pre_entities: list[dict], aggregation_strategy: AggregationStrategy) -> list[dict]: + if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}: + entities = [] + for pre_entity in pre_entities: + entity_idx = pre_entity["scores"].argmax() + score = pre_entity["scores"][entity_idx] + entity = { + "entity": self.model.config.id2label[entity_idx], + "score": score, + "index": pre_entity["index"], + "word": pre_entity["word"], + "start": pre_entity["start"], + "end": pre_entity["end"], + } + entities.append(entity) + else: + entities = self.aggregate_words(pre_entities, aggregation_strategy) + + if aggregation_strategy == AggregationStrategy.NONE: + return entities + return self.group_entities(entities) + + def aggregate_word(self, entities: list[dict], aggregation_strategy: AggregationStrategy) -> dict: + word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities]) + if aggregation_strategy == AggregationStrategy.FIRST: + scores = entities[0]["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.model.config.id2label[idx] + elif aggregation_strategy == AggregationStrategy.MAX: + max_entity = max(entities, key=lambda entity: entity["scores"].max()) + scores = max_entity["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.model.config.id2label[idx] + elif aggregation_strategy == AggregationStrategy.AVERAGE: + scores = np.stack([entity["scores"] for entity in entities]) + average_scores = np.nanmean(scores, axis=0) + entity_idx = average_scores.argmax() + entity = self.model.config.id2label[entity_idx] + score = average_scores[entity_idx] + else: + raise ValueError("Invalid aggregation_strategy") + new_entity = { + "entity": entity, + "score": score, + "word": word, + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return new_entity + + def aggregate_words(self, entities: list[dict], aggregation_strategy: AggregationStrategy) -> list[dict]: + """ + Override tokens from a given word that disagree to force agreement on word boundaries. 
+ + Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft| + company| B-ENT I-ENT + """ + if aggregation_strategy in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }: + raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation") + + word_entities = [] + word_group = None + for entity in entities: + if word_group is None: + word_group = [entity] + elif entity["is_subword"]: + word_group.append(entity) + else: + word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) + word_group = [entity] + # Last item + if word_group is not None: + word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) + return word_entities + + def group_sub_entities(self, entities: list[dict]) -> dict: + """ + Group together the adjacent tokens with the same entity predicted. + + Args: + entities (`dict`): The entities predicted by the pipeline. + """ + # Get the first entity in the entity group + entity = entities[0]["entity"].split("-", 1)[-1] + scores = np.nanmean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": np.mean(scores), + "word": self.tokenizer.convert_tokens_to_string(tokens), + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return entity_group + + def get_tag(self, entity_name: str) -> tuple[str, str]: + if entity_name.startswith("B-"): + bi = "B" + tag = entity_name[2:] + elif entity_name.startswith("I-"): + bi = "I" + tag = entity_name[2:] + else: + # It's not in B-, I- format + # Default to I- for continuation. + bi = "I" + tag = entity_name + return bi, tag + + def group_entities(self, entities: list[dict]) -> list[dict]: + """ + Find and group together the adjacent tokens with the same entity predicted. + + Args: + entities (`dict`): The entities predicted by the pipeline. + """ + + entity_groups = [] + entity_group_disagg = [] + + for entity in entities: + if not entity_group_disagg: + entity_group_disagg.append(entity) + continue + + # If the current entity is similar and adjacent to the previous entity, + # append it to the disaggregated entity group + # The split is meant to account for the "B" and "I" prefixes + # Shouldn't merge if both entities are B-type + bi, tag = self.get_tag(entity["entity"]) + last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) + + if tag == last_tag and bi != "B": + # Modify subword type to be previous_type + entity_group_disagg.append(entity) + else: + # If the current entity is different from the previous entity + # aggregate the disaggregated entity group + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + entity_group_disagg = [entity] + if entity_group_disagg: + # it's the last entity, add it to the entity groups + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + + return entity_groups + + +NerPipeline = TokenClassificationPipeline diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/video_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/video_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..1ee8dc86e161e3b24f1e71701239f6cc3b2d02a5 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/video_classification.py @@ -0,0 +1,195 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +from io import BytesIO +from typing import Any, Optional, Union, overload + +import requests + +from ..utils import ( + add_end_docstrings, + is_av_available, + is_torch_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_av_available(): + import av + import numpy as np + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class VideoClassificationPipeline(Pipeline): + """ + Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a + video. + + This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"video-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=video-classification). + """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "av") + self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES) + + def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None): + preprocess_params = {} + if frame_sampling_rate is not None: + preprocess_params["frame_sampling_rate"] = frame_sampling_rate + if num_frames is not None: + preprocess_params["num_frames"] = num_frames + + postprocess_params = {} + if top_k is not None: + postprocess_params["top_k"] = top_k + if function_to_apply is not None: + if function_to_apply not in ["softmax", "sigmoid", "none"]: + raise ValueError( + f"Invalid value for `function_to_apply`: {function_to_apply}. " + "Valid options are ['softmax', 'sigmoid', 'none']" + ) + postprocess_params["function_to_apply"] = function_to_apply + else: + postprocess_params["function_to_apply"] = "softmax" + return preprocess_params, {}, postprocess_params + + @overload + def __call__(self, inputs: str, **kwargs: Any) -> list[dict[str, Any]]: ... + + @overload + def __call__(self, inputs: list[str], **kwargs: Any) -> list[list[dict[str, Any]]]: ... + + def __call__(self, inputs: Optional[Union[str, list[str]]] = None, **kwargs): + """ + Assign labels to the video(s) passed as inputs. + + Args: + inputs (`str`, `list[str]`): + The pipeline handles three types of videos: + + - A string containing a http link pointing to a video + - A string containing a local path to a video + + The pipeline accepts either a single video or a batch of videos, which must then be passed as a string. + Videos in a batch must all be in the same format: all as http links or all as local paths. + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. 
If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`): + The number of frames sampled from the video to run the classification on. If not provided, will default + to the number of frames specified in the model configuration. + frame_sampling_rate (`int`, *optional*, defaults to 1): + The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every + frame will be used. + function_to_apply(`str`, *optional*, defaults to "softmax"): + The function to apply to the model output. By default, the pipeline will apply the softmax function to + the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's + built-in `None` will default to "softmax", so you need to pass the string "none" to disable any + post-processing. + + Return: + A list of dictionaries or a list of list of dictionaries containing result. If the input is a single video, + will return a list of `top_k` dictionaries, if the input is a list of several videos, will return a list of list of + `top_k` dictionaries corresponding to the videos. + + The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. + - **score** (`int`) -- The score attributed by the model for that label. + """ + # After deprecation of this is completed, remove the default `None` value for `images` + if "videos" in kwargs: + warnings.warn( + "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted", + FutureWarning, + ) + inputs = kwargs.pop("videos") + if inputs is None: + raise ValueError("Cannot call the video-classification pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def preprocess(self, video, num_frames=None, frame_sampling_rate=1): + if num_frames is None: + num_frames = self.model.config.num_frames + + if video.startswith("http://") or video.startswith("https://"): + video = BytesIO(requests.get(video).content) + + container = av.open(video) + + start_idx = 0 + end_idx = num_frames * frame_sampling_rate - 1 + indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64) + + video = read_video_pyav(container, indices) + video = list(video) + + model_inputs = self.image_processor(video, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.dtype) + return model_inputs + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"): + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + + if self.framework == "pt": + if function_to_apply == "softmax": + probs = model_outputs.logits[0].softmax(-1) + elif function_to_apply == "sigmoid": + probs = model_outputs.logits[0].sigmoid() + else: + probs = model_outputs.logits[0] + scores, ids = probs.topk(top_k) + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + scores = scores.tolist() + ids = ids.tolist() + return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] + + +def read_video_pyav(container, indices): + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > 
end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/visual_question_answering.py b/venv/lib/python3.13/site-packages/transformers/pipelines/visual_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..609eaf2e9d554c3215c4867060779376fccc218a --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/visual_question_answering.py @@ -0,0 +1,216 @@ +from typing import Optional, Union + +from ..generation import GenerationConfig +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES + from .pt_utils import KeyDataset + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True)) +class VisualQuestionAnsweringPipeline(Pipeline): + """ + Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only + available in PyTorch. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa") + >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png" + >>> oracle(question="What is she wearing ?", image=image_url) + [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}] + + >>> oracle(question="What is she wearing ?", image=image_url, top_k=1) + [{'score': 0.948, 'answer': 'hat'}] + + >>> oracle(question="Is this a person ?", image=image_url, top_k=1) + [{'score': 0.993, 'answer': 'yes'}] + + >>> oracle(question="Is this a man ?", image=image_url, top_k=1) + [{'score': 0.996, 'answer': 'no'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifiers: `"visual-question-answering", "vqa"`. + + The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See + the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering). 
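+
+ A single image can also be broadcast over several questions (an illustrative sketch reusing `oracle` and
+ `image_url` from the example above; each question then yields its own list of answers):
+
+ ```python
+ answers = oracle(question=["What is she wearing ?", "Is this a person ?"], image=image_url, top_k=1)
+ ```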
+ """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + + _pipeline_calls_generate = True + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES) + + def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs): + preprocess_params, postprocess_params = {}, {} + if padding is not None: + preprocess_params["padding"] = padding + if truncation is not None: + preprocess_params["truncation"] = truncation + if timeout is not None: + preprocess_params["timeout"] = timeout + if top_k is not None: + postprocess_params["top_k"] = top_k + + forward_params = {} + if getattr(self, "assistant_model", None) is not None: + forward_params["assistant_model"] = self.assistant_model + if getattr(self, "assistant_tokenizer", None) is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, postprocess_params + + def __call__( + self, + image: Union["Image.Image", str, list["Image.Image"], list[str], "KeyDataset"], + question: Optional[Union[str, list[str]]] = None, + **kwargs, + ): + r""" + Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed + below: + + - `pipeline(image=image, question=question)` + - `pipeline({"image": image, "question": question})` + - `pipeline([{"image": image, "question": question}])` + - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])` + + Args: + image (`str`, `list[str]`, `PIL.Image`, `list[PIL.Image]` or `KeyDataset`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. If given a single image, it can be + broadcasted to multiple questions. + For dataset: the passed in dataset must be of type `transformers.pipelines.pt_utils.KeyDataset` + Example: + ```python + >>> from transformers.pipelines.pt_utils import KeyDataset + >>> from datasets import load_dataset + + >>> dataset = load_dataset("detection-datasets/coco") + >>> oracle(image=KeyDataset(dataset, "image"), question="What's in this image?") + + ``` + question (`str`, `list[str]`): + The question(s) asked. If given a single question, it can be broadcasted to multiple images. + If multiple images and questions are given, each and every question will be broadcasted to all images + (same effect as a Cartesian product) + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: + A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. 
+ - **score** (`int`) -- The score attributed by the model for that label. + """ + is_dataset = isinstance(image, KeyDataset) + is_image_batch = isinstance(image, list) and all(isinstance(item, (Image.Image, str)) for item in image) + is_question_batch = isinstance(question, list) and all(isinstance(item, str) for item in question) + + if isinstance(image, (Image.Image, str)) and isinstance(question, str): + inputs = {"image": image, "question": question} + elif (is_image_batch or is_dataset) and isinstance(question, str): + inputs = [{"image": im, "question": question} for im in image] + elif isinstance(image, (Image.Image, str)) and is_question_batch: + inputs = [{"image": image, "question": q} for q in question] + elif (is_image_batch or is_dataset) and is_question_batch: + question_image_pairs = [] + for q in question: + for im in image: + question_image_pairs.append({"image": im, "question": q}) + inputs = question_image_pairs + else: + """ + Supports the following format + - {"image": image, "question": question} + - [{"image": image, "question": question}] + - Generator and datasets + """ + inputs = image + results = super().__call__(inputs, **kwargs) + return results + + def preprocess(self, inputs, padding=False, truncation=False, timeout=None): + image = load_image(inputs["image"], timeout=timeout) + model_inputs = self.tokenizer( + inputs["question"], + return_tensors=self.framework, + padding=padding, + truncation=truncation, + ) + image_features = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.dtype) + model_inputs.update(image_features) + return model_inputs + + def _forward(self, model_inputs, **generate_kwargs): + if self.model.can_generate(): + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + model_outputs = self.model.generate(**model_inputs, **generate_kwargs) + else: + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5): + if self.model.can_generate(): + return [ + {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()} + for output_ids in model_outputs + ] + else: + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + + if self.framework == "pt": + probs = model_outputs.logits.sigmoid()[0] + scores, ids = probs.topk(top_k) + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + scores = scores.tolist() + ids = ids.tolist() + return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..9c21681a0d8ef1c93eb16bbb1740a509f091a0bb --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import UserDict +from typing import Any, Union + +import numpy as np +import requests + +from ..utils import ( + add_end_docstrings, + logging, +) +from .audio_classification import ffmpeg_read +from .base import Pipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True)) +class ZeroShotAudioClassificationPipeline(Pipeline): + """ + Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you + provide an audio and a set of `candidate_labels`. + + + + The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage. + + + + Example: + ```python + >>> from transformers import pipeline + >>> from datasets import load_dataset + + >>> dataset = load_dataset("ashraq/esc50") + >>> audio = next(iter(dataset["train"]["audio"]))["array"] + >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused") + >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"]) + [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}] + ``` + + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio + classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-audio-classification"`. See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification). + """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = True + _load_tokenizer = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if self.framework != "pt": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + # No specific FOR_XXX available yet + + def __call__(self, audios: Union[np.ndarray, bytes, str, dict], **kwargs: Any) -> list[dict[str, Any]]: + """ + Assign labels to the audio(s) passed as inputs. + + Args: + audios (`str`, `list[str]`, `np.array` or `list[np.array]`): + The pipeline handles three types of inputs: + - A string containing a http link pointing to an audio + - A string containing a local path to an audio + - An audio loaded in numpy + candidate_labels (`list[str]`): + The candidate labels for this audio. They will be formatted using *hypothesis_template*. + hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`): + The format used in conjunction with *candidate_labels* to attempt the audio classification by + replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are + already formatted. + Return: + A list of dictionaries containing one entry per proposed label. Each dictionary contains the + following keys: + - **label** (`str`) -- One of the suggested *candidate_labels*. + - **score** (`float`) -- The score attributed by the model to that label. 
It is a value between + 0 and 1, computed as the `softmax` of `logits_per_audio`. + """ + return super().__call__(audios, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + return preprocess_params, {}, {} + + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."): + if isinstance(audio, str): + if audio.startswith("http://") or audio.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + audio = requests.get(audio).content + else: + with open(audio, "rb") as f: + audio = f.read() + + if isinstance(audio, bytes): + audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate) + + if not isinstance(audio, np.ndarray): + raise TypeError("We expect a numpy ndarray as input") + if len(audio.shape) != 1: + raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline") + + inputs = self.feature_extractor( + [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" + ) + if self.framework == "pt": + inputs = inputs.to(self.dtype) + inputs["candidate_labels"] = candidate_labels + sequences = [hypothesis_template.format(x) for x in candidate_labels] + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True) + inputs["text_inputs"] = [text_inputs] + return inputs + + def _forward(self, model_inputs): + candidate_labels = model_inputs.pop("candidate_labels") + text_inputs = model_inputs.pop("text_inputs") + if isinstance(text_inputs[0], UserDict): + text_inputs = text_inputs[0] + else: + # Batching case. + text_inputs = text_inputs[0][0] + + outputs = self.model(**text_inputs, **model_inputs) + + model_outputs = { + "candidate_labels": candidate_labels, + "logits": outputs.logits_per_audio, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = model_outputs.pop("candidate_labels") + logits = model_outputs["logits"][0] + + if self.framework == "pt": + probs = logits.softmax(dim=0) + scores = probs.tolist() + else: + raise ValueError("`tf` framework not supported.") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a609bcd1677bad2b4a09fb1955073714201125 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_classification.py @@ -0,0 +1,271 @@ +import inspect +from typing import Union + +import numpy as np + +from ..tokenization_utils import TruncationStrategy +from ..utils import add_end_docstrings, logging +from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + + +class ZeroShotClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for zero-shot for text classification by turning each possible label into an NLI + premise/hypothesis pair. 
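+
+ For illustration, one sequence and two candidate labels expand into one premise/hypothesis pair per label
+ (a minimal sketch of calling the handler directly):
+
+ ```python
+ handler = ZeroShotClassificationArgumentHandler()
+ sequence_pairs, sequences = handler("I need help", ["urgent", "not urgent"], "This example is {}.")
+ # sequence_pairs == [["I need help", "This example is urgent."],
+ #                    ["I need help", "This example is not urgent."]]
+ # sequences == ["I need help"]
+ ```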
+ """ + + def _parse_labels(self, labels): + if isinstance(labels, str): + labels = [label.strip() for label in labels.split(",") if label.strip()] + return labels + + def __call__(self, sequences, labels, hypothesis_template): + if len(labels) == 0 or len(sequences) == 0: + raise ValueError("You must include at least one label and at least one sequence.") + if hypothesis_template.format(labels[0]) == hypothesis_template: + raise ValueError( + f'The provided hypothesis_template "{hypothesis_template}" was not able to be formatted with the target labels. ' + "Make sure the passed template includes formatting syntax such as {} where the label should go." + ) + + if isinstance(sequences, str): + sequences = [sequences] + + sequence_pairs = [] + for sequence in sequences: + sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels]) + + return sequence_pairs, sequences + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class ZeroShotClassificationPipeline(ChunkPipeline): + """ + NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural + language inference) tasks. Equivalent of `text-classification` pipelines, but these models don't require a + hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is + **much** more flexible. + + Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis + pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate + label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model + config's :attr:*~transformers.PretrainedConfig.label2id*. + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="facebook/bart-large-mnli") + >>> oracle( + ... "I have a problem with my iphone that needs to be resolved asap!!", + ... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], + ... ) + {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} + + >>> oracle( + ... "I have a problem with my iphone that needs to be resolved asap!!", + ... candidate_labels=["english", "german"], + ... ) + {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['english', 'german'], 'scores': [0.814, 0.186]} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-classification"`. + + The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list + of available models on [huggingface.co/models](https://huggingface.co/models?search=nli). + """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), **kwargs): + self._args_parser = args_parser + super().__init__(**kwargs) + if self.entailment_id == -1: + logger.warning( + "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to " + "-1. 
Define a descriptive label2id mapping in the model config to ensure correct outputs." + ) + + @property + def entailment_id(self): + for label, ind in self.model.config.label2id.items(): + if label.lower().startswith("entail"): + return ind + return -1 + + def _parse_and_tokenize( + self, sequence_pairs, padding=True, add_special_tokens=True, truncation=TruncationStrategy.ONLY_FIRST, **kwargs + ): + """ + Parse arguments and tokenize only_first so that hypothesis (label) is not truncated + """ + return_tensors = self.framework + if self.tokenizer.pad_token is None: + # Override for tokenizers not supporting padding + logger.error( + "Tokenizer was not supporting padding necessary for zero-shot, attempting to use " + " `pad_token=eos_token`" + ) + self.tokenizer.pad_token = self.tokenizer.eos_token + try: + inputs = self.tokenizer( + sequence_pairs, + add_special_tokens=add_special_tokens, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + ) + except Exception as e: + if "too short" in str(e): + # tokenizers might yell that we want to truncate + # to a value that is not even reached by the input. + # In that case we don't want to truncate. + # It seems there's not a really better way to catch that + # exception. + + inputs = self.tokenizer( + sequence_pairs, + add_special_tokens=add_special_tokens, + return_tensors=return_tensors, + padding=padding, + truncation=TruncationStrategy.DO_NOT_TRUNCATE, + ) + else: + raise e + + return inputs + + def _sanitize_parameters(self, **kwargs): + if kwargs.get("multi_class") is not None: + kwargs["multi_label"] = kwargs["multi_class"] + logger.warning( + "The `multi_class` argument has been deprecated and renamed to `multi_label`. " + "`multi_class` will be removed in a future version of Transformers." + ) + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = self._args_parser._parse_labels(kwargs["candidate_labels"]) + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + postprocess_params = {} + if "multi_label" in kwargs: + postprocess_params["multi_label"] = kwargs["multi_label"] + return preprocess_params, {}, postprocess_params + + def __call__( + self, + sequences: Union[str, list[str]], + *args, + **kwargs, + ): + """ + Classify the sequence(s) given as inputs. See the [`ZeroShotClassificationPipeline`] documentation for more + information. + + Args: + sequences (`str` or `list[str]`): + The sequence(s) to classify, will be truncated if the model input is too large. + candidate_labels (`str` or `list[str]`): + The set of possible class labels to classify each sequence into. Can be a single label, a string of + comma-separated labels, or a list of labels. + hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`): + The template used to turn each label into an NLI-style hypothesis. This template must include a {} or + similar syntax for the candidate label to be inserted into the template. For example, the default + template is `"This example is {}."` With the candidate label `"sports"`, this would be fed into the + model like `" sequence to classify This example is sports . "`. The default template + works well in many cases, but it may be worthwhile to experiment with different templates depending on + the task setting. + multi_label (`bool`, *optional*, defaults to `False`): + Whether or not multiple candidate labels can be true. 
If `False`, the scores are normalized such that + the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered + independent and probabilities are normalized for each candidate by doing a softmax of the entailment + score vs. the contradiction score. + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **sequence** (`str`) -- The sequence for which this is the output. + - **labels** (`list[str]`) -- The labels sorted by order of likelihood. + - **scores** (`list[float]`) -- The probabilities for each of the labels. + """ + if len(args) == 0: + pass + elif len(args) == 1 and "candidate_labels" not in kwargs: + kwargs["candidate_labels"] = args[0] + else: + raise ValueError(f"Unable to understand extra arguments {args}") + + return super().__call__(sequences, **kwargs) + + def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."): + sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template) + + for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)): + model_input = self._parse_and_tokenize([sequence_pair]) + + yield { + "candidate_label": candidate_label, + "sequence": sequences[0], + "is_last": i == len(candidate_labels) - 1, + **model_input, + } + + def _forward(self, inputs): + candidate_label = inputs["candidate_label"] + sequence = inputs["sequence"] + model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names} + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters: + model_inputs["use_cache"] = False + outputs = self.model(**model_inputs) + + model_outputs = { + "candidate_label": candidate_label, + "sequence": sequence, + "is_last": inputs["is_last"], + **outputs, + } + return model_outputs + + def postprocess(self, model_outputs, multi_label=False): + candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] + sequences = [outputs["sequence"] for outputs in model_outputs] + if self.framework == "pt": + logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs]) + else: + logits = np.concatenate([output["logits"].numpy() for output in model_outputs]) + N = logits.shape[0] + n = len(candidate_labels) + num_sequences = N // n + reshaped_outputs = logits.reshape((num_sequences, n, -1)) + + if multi_label or len(candidate_labels) == 1: + # softmax over the entailment vs. 
contradiction dim for each label independently + entailment_id = self.entailment_id + contradiction_id = -1 if entailment_id == 0 else 0 + entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]] + scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True) + scores = scores[..., 1] + else: + # softmax the "entailment" logits over all candidate labels + entail_logits = reshaped_outputs[..., self.entailment_id] + scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True) + + top_inds = list(reversed(scores[0].argsort())) + return { + "sequence": sequences[0], + "labels": [candidate_labels[i] for i in top_inds], + "scores": scores[0, top_inds].tolist(), + } diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_image_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..6aeb9162030689b989fa6e1eabf4445dc9471372 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_image_classification.py @@ -0,0 +1,216 @@ +import warnings +from collections import UserDict +from typing import Any, Union, overload + +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + from ..tf_utils import stable_softmax + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ZeroShotImageClassificationPipeline(Pipeline): + """ + Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you + provide an image and a set of `candidate_labels`. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="google/siglip-so400m-patch14-384") + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["animals", "humans", "landscape"], + ... ) + [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}] + + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["black and white", "photorealist", "painting"], + ... ) + [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-image-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification). 
+ """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + requires_backends(self, "vision") + self.check_model_type( + TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + ) + + @overload + def __call__( + self, image: Union[str, "Image.Image"], candidate_labels: list[str], **kwargs: Any + ) -> list[dict[str, Any]]: ... + + @overload + def __call__( + self, image: Union[list[str], list["Image.Image"]], candidate_labels: list[str], **kwargs: Any + ) -> list[list[dict[str, Any]]]: ... + + def __call__( + self, + image: Union[str, list[str], "Image.Image", list["Image.Image"]], + candidate_labels: list[str], + **kwargs: Any, + ) -> Union[list[dict[str, Any]], list[list[dict[str, Any]]]]: + """ + Assign labels to the image(s) passed as inputs. + + Args: + image (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + candidate_labels (`list[str]`): + The candidate labels for this image. They will be formatted using *hypothesis_template*. + + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): + The format used in conjunction with *candidate_labels* to attempt the image classification by + replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are + already formatted. + + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list of dictionaries containing one entry per proposed label. Each dictionary contains the + following keys: + - **label** (`str`) -- One of the suggested *candidate_labels*. + - **score** (`float`) -- The score attributed by the model to that label. It is a value between + 0 and 1, computed as the `softmax` of `logits_per_image`. 
+ """ + # After deprecation of this is completed, remove the default `None` value for `image` + if "images" in kwargs: + image = kwargs.pop("images") + if image is None: + raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!") + return super().__call__(image, candidate_labels=candidate_labels, **kwargs) + + def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + if tokenizer_kwargs is not None: + warnings.warn( + "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) + preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs + + return preprocess_params, {}, {} + + def preprocess( + self, + image, + candidate_labels=None, + hypothesis_template="This is a photo of {}.", + timeout=None, + tokenizer_kwargs=None, + ): + if tokenizer_kwargs is None: + tokenizer_kwargs = {} + image = load_image(image, timeout=timeout) + inputs = self.image_processor(images=[image], return_tensors=self.framework) + if self.framework == "pt": + inputs = inputs.to(self.dtype) + inputs["candidate_labels"] = candidate_labels + sequences = [hypothesis_template.format(x) for x in candidate_labels] + tokenizer_default_kwargs = {"padding": True} + if "siglip" in self.model.config.model_type: + tokenizer_default_kwargs.update(padding="max_length", max_length=64, truncation=True) + tokenizer_default_kwargs.update(tokenizer_kwargs) + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, **tokenizer_default_kwargs) + inputs["text_inputs"] = [text_inputs] + return inputs + + def _forward(self, model_inputs): + candidate_labels = model_inputs.pop("candidate_labels") + text_inputs = model_inputs.pop("text_inputs") + if isinstance(text_inputs[0], UserDict): + text_inputs = text_inputs[0] + else: + # Batching case. 
+ text_inputs = text_inputs[0][0] + + outputs = self.model(**text_inputs, **model_inputs) + + model_outputs = { + "candidate_labels": candidate_labels, + "logits": outputs.logits_per_image, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = model_outputs.pop("candidate_labels") + logits = model_outputs["logits"][0] + if self.framework == "pt" and "siglip" in self.model.config.model_type: + probs = torch.sigmoid(logits).squeeze(-1) + scores = probs.tolist() + if not isinstance(scores, list): + scores = [scores] + elif self.framework == "pt": + probs = logits.softmax(dim=-1).squeeze(-1) + scores = probs.tolist() + if not isinstance(scores, list): + scores = [scores] + elif self.framework == "tf": + probs = stable_softmax(logits, axis=-1) + scores = probs.numpy().tolist() + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_object_detection.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..55154af9ab3b30e93adf904e25b13b4930b5aa53 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_object_detection.py @@ -0,0 +1,248 @@ +from typing import Any, Optional, Union, overload + +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends +from .base import ChunkPipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image, valid_images + +if is_torch_available(): + import torch + + from transformers.modeling_outputs import BaseModelOutput + + from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ZeroShotObjectDetectionPipeline(ChunkPipeline): + """ + Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of + objects when you provide an image and a set of `candidate_labels`. + + Example: + + ```python + >>> from transformers import pipeline + + >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection") + >>> detector( + ... "http://images.cocodataset.org/val2017/000000039769.jpg", + ... candidate_labels=["cat", "couch"], + ... ) + [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}] + + >>> detector( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["head", "bird"], + ... ) + [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-object-detection"`. 
+ + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection). + """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if self.framework == "tf": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES) + + @overload + def __call__( + self, image: Union[str, "Image.Image"], candidate_labels: Union[str, list[str]], **kwargs: Any + ) -> list[dict[str, Any]]: ... + + @overload + def __call__(self, image: list[dict[str, Any]], **kwargs: Any) -> list[list[dict[str, Any]]]: ... + + def __call__( + self, + image: Union[str, "Image.Image", list[dict[str, Any]]], + candidate_labels: Optional[Union[str, list[str]]] = None, + **kwargs: Any, + ) -> Union[list[dict[str, Any]], list[list[dict[str, Any]]]]: + """ + Detect objects (bounding boxes & classes) in the image(s) passed as inputs. + + Args: + image (`str`, `PIL.Image` or `list[dict[str, Any]]`): + The pipeline handles three types of images: + + - A string containing an http url pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + You can use this parameter to send directly a list of images, or a dataset or a generator like so: + + ```python + >>> from transformers import pipeline + + >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection") + >>> detector( + ... [ + ... { + ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg", + ... "candidate_labels": ["cat", "couch"], + ... }, + ... { + ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg", + ... "candidate_labels": ["cat", "couch"], + ... }, + ... ] + ... ) + [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]] + ``` + + + candidate_labels (`str` or `list[str]` or `list[list[str]]`): + What the model should recognize in the image. + + threshold (`float`, *optional*, defaults to 0.1): + The probability necessary to make a prediction. + + top_k (`int`, *optional*, defaults to None): + The number of top predictions that will be returned by the pipeline. If the provided number is `None` + or higher than the number of predictions available, it will default to the number of predictions. + + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + + Return: + A list of lists containing prediction results, one list per input image. Each list contains dictionaries + with the following keys: + + - **label** (`str`) -- Text query corresponding to the found object. + - **score** (`float`) -- Score corresponding to the object (between 0 and 1). 
+ - **box** (`dict[str,int]`) -- Bounding box of the detected object in image's original size. It is a + dictionary with `x_min`, `x_max`, `y_min`, `y_max` keys. + """ + if "text_queries" in kwargs: + candidate_labels = kwargs.pop("text_queries") + + if isinstance(image, (str, Image.Image)): + inputs = {"image": image, "candidate_labels": candidate_labels} + elif isinstance(image, (list, tuple)) and valid_images(image): + return list( + super().__call__( + ({"image": img, "candidate_labels": labels} for img, labels in zip(image, candidate_labels)), + **kwargs, + ) + ) + else: + """ + Supports the following format + - {"image": image, "candidate_labels": candidate_labels} + - [{"image": image, "candidate_labels": candidate_labels}] + - Generator and datasets + This is a common pattern in other multimodal pipelines, so we support it here as well. + """ + inputs = image + + results = super().__call__(inputs, **kwargs) + return results + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + postprocess_params = {} + if "threshold" in kwargs: + postprocess_params["threshold"] = kwargs["threshold"] + if "top_k" in kwargs: + postprocess_params["top_k"] = kwargs["top_k"] + return preprocess_params, {}, postprocess_params + + def preprocess(self, inputs, timeout=None): + image = load_image(inputs["image"], timeout=timeout) + candidate_labels = inputs["candidate_labels"] + if isinstance(candidate_labels, str): + candidate_labels = candidate_labels.split(",") + + target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32) + for i, candidate_label in enumerate(candidate_labels): + text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework) + image_features = self.image_processor(image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.dtype) + yield { + "is_last": i == len(candidate_labels) - 1, + "target_size": target_size, + "candidate_label": candidate_label, + **text_inputs, + **image_features, + } + + def _forward(self, model_inputs): + target_size = model_inputs.pop("target_size") + candidate_label = model_inputs.pop("candidate_label") + is_last = model_inputs.pop("is_last") + + outputs = self.model(**model_inputs) + + model_outputs = {"target_size": target_size, "candidate_label": candidate_label, "is_last": is_last, **outputs} + return model_outputs + + def postprocess(self, model_outputs, threshold=0.1, top_k=None): + results = [] + for model_output in model_outputs: + label = model_output["candidate_label"] + model_output = BaseModelOutput(model_output) + outputs = self.image_processor.post_process_object_detection( + outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"] + )[0] + + for index in outputs["scores"].nonzero(): + score = outputs["scores"][index].item() + box = self._get_bounding_box(outputs["boxes"][index][0]) + + result = {"score": score, "label": label, "box": box} + results.append(result) + + results = sorted(results, key=lambda x: x["score"], reverse=True) + if top_k: + results = results[:top_k] + + return results + + def _get_bounding_box(self, box: "torch.Tensor") -> dict[str, int]: + """ + Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... } + + Args: + box (`torch.Tensor`): Tensor containing the coordinates in corners format. + + Returns: + bbox (`dict[str, int]`): Dict containing the coordinates in corners format. 
+ """ + if self.framework != "pt": + raise ValueError("The ZeroShotObjectDetectionPipeline is only available in PyTorch.") + xmin, ymin, xmax, ymax = box.int().tolist() + bbox = { + "xmin": xmin, + "ymin": ymin, + "xmax": xmax, + "ymax": ymax, + } + return bbox