diff --git a/venv/lib/python3.13/site-packages/transformers/commands/__init__.py b/venv/lib/python3.13/site-packages/transformers/commands/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa5d95a85b538171ec9cf4fa16e892df1efdef6b
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/commands/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from argparse import ArgumentParser
+
+
+class BaseTransformersCLICommand(ABC):
+    @staticmethod
+    @abstractmethod
+    def register_subcommand(parser: ArgumentParser):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def run(self):
+        raise NotImplementedError()
diff --git a/venv/lib/python3.13/site-packages/transformers/commands/add_fast_image_processor.py b/venv/lib/python3.13/site-packages/transformers/commands/add_fast_image_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..90c911525f7773a34ca9f97f180d5b127649f3c2
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/commands/add_fast_image_processor.py
@@ -0,0 +1,530 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from argparse import ArgumentParser, Namespace +from datetime import date +from pathlib import Path + +from ..utils import logging +from . import BaseTransformersCLICommand + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +CURRENT_YEAR = date.today().year +TRANSFORMERS_PATH = Path(__file__).parent.parent +REPO_PATH = TRANSFORMERS_PATH.parent.parent + + +def add_fast_image_processor_to_model_init( + fast_image_processing_module_file: str, fast_image_processor_name, model_name: str +): + """ + Add the fast image processor to the __init__.py file of the model. + """ + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "r", encoding="utf-8") as f: + content = f.read() + + fast_image_processing_module_file = fast_image_processing_module_file.split(os.sep)[-1].replace(".py", "") + + if "import *" in content: + # we have an init file in the updated format + # get the indented block after if TYPE_CHECKING: and before else:, append the new import, sort the imports and write the updated content + # Step 1: Find the block + block_regex = re.compile( + r"if TYPE_CHECKING:\n(?P.*?)(?=\s*else:)", + re.DOTALL, + ) + match = block_regex.search(content) + + if not match: + raise ValueError("Couldn't find the 'if TYPE_CHECKING' block.") + + block_content = match.group("if_block") # The captured import block + + # Step 2: Parse existing entries + entries = block_content.split("\n") + indent = " " * (len(entries[0]) - len(entries[0].lstrip())) + new_entry = f"{indent}from .{fast_image_processing_module_file} import *" + if new_entry not in entries: + entries.append(new_entry) + entries.sort() + updated_block = "\n".join(entry for entry in entries) + + # Replace the original block in the content + updated_content = content[: match.start("if_block")] + updated_block + content[match.end("if_block") :] + else: + # we have an init file in the old format + + # add "is_torchvision_available" import to from ...utils import ( + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+\.\.\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... 
b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + imports = (match.group(1) or match.group(2)).split(",") + imports = imports[:-1] if imports[-1] == "\n" else imports + imports = [imp.strip() for imp in imports] + + # Add the new import if not already present + if "is_torchvision_available" not in imports: + imports.append("is_torchvision_available") + imports.sort() + + # Convert to multi-line import in all cases + updated_imports = "(\n " + ",\n ".join(imports) + ",\n)" + + return f"from ...utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + vision_import_structure_block = f' _import_structure["{fast_image_processing_module_file[:-5]}"] = ["{fast_image_processor_name[:-4]}"]\n' + + added_import_structure_block = ( + "try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + "except OptionalDependencyNotAvailable:\n" + " pass\n" + "else:\n" + f' _import_structure["{fast_image_processing_module_file}"] = ["{fast_image_processor_name}"]\n' + ) + + if vision_import_structure_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_structure_block not in updated_content: + updated_content = updated_content.replace( + vision_import_structure_block, vision_import_structure_block + "\n" + added_import_structure_block + ) + + vision_import_statement_block = ( + f" from .{fast_image_processing_module_file[:-5]} import {fast_image_processor_name[:-4]}\n" + ) + + added_import_statement_block = ( + " try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + " except OptionalDependencyNotAvailable:\n" + " pass\n" + " else:\n" + f" from .{fast_image_processing_module_file} import {fast_image_processor_name}\n" + ) + + if vision_import_statement_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_statement_block not in updated_content: + updated_content = updated_content.replace( + vision_import_statement_block, vision_import_statement_block + "\n" + added_import_statement_block + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_auto(image_processor_name: str, fast_image_processor_name: str): + """ + Add the fast image processor to the auto module. + """ + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "r", encoding="utf-8") as f: + content = f.read() + + # get all lines containing the image processor name + updated_content = content.replace( + f'("{image_processor_name}",)', f'("{image_processor_name}", "{fast_image_processor_name}")' + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the model's doc file. 
+ """ + doc_source = REPO_PATH / "docs" / "source" + # find the doc files + doc_files = list(doc_source.glob(f"*/model_doc/{model_name}.md")) + if not doc_files: + # try again with "-" + doc_files = list(doc_source.glob(f"*/model_doc/{model_name.replace('_', '-')}.md")) + if not doc_files: + raise ValueError(f"No doc files found for {model_name}") + + base_doc_string = ( + f"## {fast_image_processor_name[:-4]}\n\n[[autodoc]] {fast_image_processor_name[:-4]}\n - preprocess" + ) + fast_doc_string = f"## {fast_image_processor_name}\n\n[[autodoc]] {fast_image_processor_name}\n - preprocess" + + for doc_file in doc_files: + with open(doc_file, "r", encoding="utf-8") as f: + content = f.read() + + if fast_doc_string not in content: + # add the fast image processor to the doc + updated_content = content.replace( + base_doc_string, + base_doc_string + "\n\n" + fast_doc_string, + ) + + # write the updated content + with open(doc_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the image processing tests. + """ + tests_path = REPO_PATH / "tests" / "models" / model_name + test_file = tests_path / f"test_image_processing_{model_name}.py" + if not os.path.exists(test_file): + logger.warning(f"No test file found for {model_name}. Skipping.") + return + + with open(test_file, "r", encoding="utf-8") as f: + content = f.read() + + # add is_torchvision_available import to the imports + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+transformers\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + existing_imports = (match.group(1) or match.group(2)).split(",") + existing_imports = existing_imports[:-1] if existing_imports[-1] == "\n" else existing_imports + existing_imports = [imp.strip() for imp in existing_imports] + + # Add the new import if not already present + if "is_torchvision_available" not in existing_imports: + existing_imports.append("is_torchvision_available") + existing_imports.sort() + + # Rebuild the import statement + if match.group(1): # Single-line import + updated_imports = ", ".join(existing_imports) + else: # Multi-line import + updated_imports = "(\n " + ",\n ".join(existing_imports) + ",\n)" + + return f"from transformers.utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + # add the fast image processor to the imports + base_import_string = f" from transformers import {fast_image_processor_name[:-4]}" + fast_import_string = ( + f" if is_torchvision_available():\n from transformers import {fast_image_processor_name}" + ) + if fast_import_string not in updated_content: + updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string) + + # get line starting with " image_processing_class = " and add a line after it starting with " fast_image_processing_class = " + image_processing_class_line = re.search(r" image_processing_class = .*", updated_content) + if not image_processing_class_line: + logger.warning(f"Couldn't find the 'image_processing_class' line in {test_file}. 
Skipping.") + return + + fast_image_processing_class_line = ( + f" fast_image_processing_class = {fast_image_processor_name} if is_torchvision_available() else None" + ) + if " fast_image_processing_class = " not in updated_content: + updated_content = updated_content.replace( + image_processing_class_line.group(0), + image_processing_class_line.group(0) + "\n" + fast_image_processing_class_line, + ) + + # write the updated content + with open(test_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def get_fast_image_processing_content_header(content: str) -> str: + """ + Get the header of the slow image processor file. + """ + # get all the commented lines at the beginning of the file + content_header = re.search(r"^# coding=utf-8\n(#[^\n]*\n)*", content, re.MULTILINE) + if not content_header: + logger.warning("Couldn't find the content header in the slow image processor file. Using a default header.") + return ( + f"# coding=utf-8\n" + f"# Copyright {CURRENT_YEAR} The HuggingFace Team. All rights reserved.\n" + f"#\n" + f'# Licensed under the Apache License, Version 2.0 (the "License");\n' + f"# you may not use this file except in compliance with the License.\n" + f"# You may obtain a copy of the License at\n" + f"#\n" + f"# http://www.apache.org/licenses/LICENSE-2.0\n" + f"#\n" + f"# Unless required by applicable law or agreed to in writing, software\n" + f'# distributed under the License is distributed on an "AS IS" BASIS,\n' + f"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + f"# See the License for the specific language governing permissions and\n" + f"# limitations under the License.\n" + f"\n" + ) + content_header = content_header.group(0) + # replace the year in the copyright + content_header = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content_header) + # get the line starting with """Image processor in content if it exists + match = re.search(r'^"""Image processor.*$', content, re.MULTILINE) + if match: + content_header += match.group(0).replace("Image processor", "Fast Image processor") + + return content_header + + +def write_default_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Write a default fast image processor file. Used when encountering a problem while parsing the slow image processor file. + """ + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n\n\n" + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # To be implemented\n" + " resample = None\n" + " image_mean = None\n" + " image_std = None\n" + " size = None\n" + " default_to_square = None\n" + " crop_size = None\n" + " do_resize = None\n" + " do_center_crop = None\n" + " do_rescale = None\n" + " do_normalize = None\n" + " do_convert_rgb = None\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + content = content_header + imports + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Add the fast image processor file to the model's folder. 
+ """ + # if the file already exists, do nothing + if os.path.exists(fast_image_processing_module_file): + print(f"{fast_image_processing_module_file} already exists. Skipping.") + return + + regex = rf"class {fast_image_processor_name[:-4]}.*?(\n\S|$)" + match = re.search(regex, content_base_file, re.DOTALL) + if not match: + print(f"Couldn't find the {fast_image_processor_name[:-4]} class in {fast_image_processing_module_file}") + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + # Exclude the last unindented line + slow_class_content = match.group(0).rstrip() + # get default args: + # find the __init__ block which start with def __init__ and ends with def + match = re.search(r"def __init__.*?def ", slow_class_content, re.DOTALL) + if not match: + print( + f"Couldn't find the __init__ block for {fast_image_processor_name[:-4]} in {fast_image_processing_module_file}" + ) + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + init = match.group(0) + init_signature_block = init.split(")")[0] + arg_names = init_signature_block.split(":") + arg_names = [arg_name.split("\n")[-1].strip() for arg_name in arg_names] + # get the default values + default_args = re.findall(r"= (.*?)(?:,|\))", init_signature_block) + + # build default args dict + default_args_dict = dict(zip(arg_names, default_args)) + pattern_default_size = r"size = size if size is not None else\s+(.*)" + match_default_size = re.findall(pattern_default_size, init) + default_args_dict["size"] = match_default_size[0] if match_default_size else None + pattern_default_crop_size = r"crop_size = crop_size if crop_size is not None else\s+(.*)" + match_default_crop_size = re.findall(pattern_default_crop_size, init) + default_args_dict["crop_size"] = match_default_crop_size[0] if match_default_crop_size else None + pattern_default_image_mean = r"self.image_mean = image_mean if image_mean is not None else\s+(.*)" + match_default_image_mean = re.findall(pattern_default_image_mean, init) + default_args_dict["image_mean"] = match_default_image_mean[0] if match_default_image_mean else None + pattern_default_image_std = r"self.image_std = image_std if image_std is not None else\s+(.*)" + match_default_image_std = re.findall(pattern_default_image_std, init) + default_args_dict["image_std"] = match_default_image_std[0] if match_default_image_std else None + default_args_dict["default_to_square"] = False if "(size, default_to_square=False" in init else None + + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"@auto_docstring\n" + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # This generated class can be used as a starting point for the fast image processor.\n" + " # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n" + " # only the default values should be set in the class.\n" + " # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden.\n" + " # In most cases, only the `_preprocess` method should be overridden.\n\n" + " # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n" + " # Default values 
should be checked against the slow image processor\n" + " # None values left after checking can be removed\n" + f" resample = {default_args_dict.get('resample')}\n" + f" image_mean = {default_args_dict.get('image_mean')}\n" + f" image_std = {default_args_dict.get('image_std')}\n" + f" size = {default_args_dict.get('size')}\n" + f" default_to_square = {default_args_dict.get('default_to_square')}\n" + f" crop_size = {default_args_dict.get('crop_size')}\n" + f" do_resize = {default_args_dict.get('do_resize')}\n" + f" do_center_crop = {default_args_dict.get('do_center_crop')}\n" + f" do_rescale = {default_args_dict.get('do_rescale')}\n" + f" do_normalize = {default_args_dict.get('do_normalize')}\n" + f" do_convert_rgb = {default_args_dict.get('do_convert_rgb')}\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n" + image_utils_imports = [] + if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"): + image_utils_imports.append("PILImageResampling") + if default_args_dict.get("image_mean") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_mean") + ): + image_utils_imports.append(default_args_dict.get("image_mean")) + if default_args_dict.get("image_std") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_std") + ): + image_utils_imports.append(default_args_dict.get("image_std")) + + if image_utils_imports: + # sort imports + image_utils_imports.sort() + imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n" + + imports += "from ...utils import auto_docstring\n" + + content = content_header + imports + "\n\n" + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor(model_name: str): + """ + Add the necessary references to the fast image processor in the transformers package, + and create the fast image processor file in the model's folder. 
+ """ + model_module = TRANSFORMERS_PATH / "models" / model_name + image_processing_module_file = list(model_module.glob("image_processing*.py")) + if not image_processing_module_file: + raise ValueError(f"No image processing module found in {model_module}") + elif len(image_processing_module_file) > 1: + for file_name in image_processing_module_file: + if not str(file_name).endswith("_fast.py"): + image_processing_module_file = str(file_name) + break + else: + image_processing_module_file = str(image_processing_module_file[0]) + + with open(image_processing_module_file, "r", encoding="utf-8") as f: + content_base_file = f.read() + + # regex to find object starting with "class " and ending with "ImageProcessor", including "ImageProcessor" in the match + image_processor_name = re.findall(r"class (\w*ImageProcessor)", content_base_file) + if not image_processor_name: + raise ValueError(f"No ImageProcessor class found in {image_processing_module_file}") + elif len(image_processor_name) > 1: + raise ValueError(f"Multiple ImageProcessor classes found in {image_processing_module_file}") + + image_processor_name = image_processor_name[0] + fast_image_processor_name = image_processor_name + "Fast" + fast_image_processing_module_file = image_processing_module_file.replace(".py", "_fast.py") + + print(f"Adding {fast_image_processor_name} to {fast_image_processing_module_file}") + + add_fast_image_processor_to_model_init( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_auto( + image_processor_name=image_processor_name, + fast_image_processor_name=fast_image_processor_name, + ) + + add_fast_image_processor_to_doc( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_tests( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_file( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + content_base_file=content_base_file, + ) + + +def add_new_model_like_command_factory(args: Namespace): + return AddFastImageProcessorCommand(model_name=args.model_name) + + +class AddFastImageProcessorCommand(BaseTransformersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + add_fast_image_processor_parser = parser.add_parser("add-fast-image-processor") + add_fast_image_processor_parser.add_argument( + "--model-name", + type=str, + required=True, + help="The name of the folder containing the model's implementation.", + ) + add_fast_image_processor_parser.set_defaults(func=add_new_model_like_command_factory) + + def __init__(self, model_name: str, *args): + self.model_name = model_name + + def run(self): + add_fast_image_processor(model_name=self.model_name) diff --git a/venv/lib/python3.13/site-packages/transformers/commands/add_new_model_like.py b/venv/lib/python3.13/site-packages/transformers/commands/add_new_model_like.py new file mode 100644 index 0000000000000000000000000000000000000000..fce524d4a6c0b219568229ea9b5ff387352a4485 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/commands/add_new_model_like.py @@ -0,0 +1,783 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import difflib
+import os
+import re
+import subprocess
+import textwrap
+from argparse import ArgumentParser, Namespace
+from datetime import date
+from pathlib import Path
+from typing import Any, Callable, Optional, Union
+
+from ..models.auto.configuration_auto import CONFIG_MAPPING_NAMES, MODEL_NAMES_MAPPING
+from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING_NAMES
+from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING_NAMES
+from ..models.auto.processing_auto import PROCESSOR_MAPPING_NAMES
+from ..models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES
+from ..models.auto.video_processing_auto import VIDEO_PROCESSOR_MAPPING_NAMES
+from ..utils import is_libcst_available
+from . import BaseTransformersCLICommand
+from .add_fast_image_processor import add_fast_image_processor
+
+
+# We protect this import to avoid requiring it for all `transformers` CLI commands - however it is actually
+# strictly required for this one (we need it both for modular and for the following Visitor)
+if is_libcst_available():
+    import libcst as cst
+    from libcst import CSTVisitor
+    from libcst import matchers as m
+
+    class ClassFinder(CSTVisitor):
+        """
+        A visitor to find all classes in a python module.
+        """
+
+        def __init__(self):
+            self.classes: list = []
+            self.public_classes: list = []
+            self.is_in_class = False
+
+        def visit_ClassDef(self, node: cst.ClassDef) -> None:
+            """Record class names. We assume classes always only appear at top-level (i.e. no class definition in function or similar)"""
+            self.classes.append(node.name.value)
+            self.is_in_class = True
+
+        def leave_ClassDef(self, node: cst.ClassDef):
+            self.is_in_class = False
+
+        def visit_SimpleStatementLine(self, node: cst.SimpleStatementLine):
+            """Record all public classes inside the `__all__` assignment."""
+            simple_top_level_assign_structure = m.SimpleStatementLine(
+                body=[m.Assign(targets=[m.AssignTarget(target=m.Name())])]
+            )
+            if not self.is_in_class and m.matches(node, simple_top_level_assign_structure):
+                assigned_variable = node.body[0].targets[0].target.value
+                if assigned_variable == "__all__":
+                    elements = node.body[0].value.elements
+                    self.public_classes = [element.value.value for element in elements]
+
+
+CURRENT_YEAR = date.today().year
+TRANSFORMERS_PATH = Path(__file__).parent.parent
+REPO_PATH = TRANSFORMERS_PATH.parent.parent
+
+COPYRIGHT = f"""
+# coding=utf-8
+# Copyright {CURRENT_YEAR} the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""".lstrip() + + +class ModelInfos: + """ + Retrieve the basic information about an existing model classes. + """ + + def __init__(self, lowercase_name: str): + # Just to make sure it's indeed lowercase + self.lowercase_name = lowercase_name.lower().replace(" ", "_").replace("-", "_") + if self.lowercase_name not in CONFIG_MAPPING_NAMES: + self.lowercase_name.replace("_", "-") + if self.lowercase_name not in CONFIG_MAPPING_NAMES: + raise ValueError(f"{lowercase_name} is not a valid model name") + + self.paper_name = MODEL_NAMES_MAPPING[self.lowercase_name] + self.config_class = CONFIG_MAPPING_NAMES[self.lowercase_name] + self.camelcase_name = self.config_class.replace("Config", "") + + # Get tokenizer class + if self.lowercase_name in TOKENIZER_MAPPING_NAMES: + self.tokenizer_class, self.fast_tokenizer_class = TOKENIZER_MAPPING_NAMES[self.lowercase_name] + self.fast_tokenizer_class = ( + None if self.fast_tokenizer_class == "PreTrainedTokenizerFast" else self.fast_tokenizer_class + ) + else: + self.tokenizer_class, self.fast_tokenizer_class = None, None + + self.image_processor_class, self.fast_image_processor_class = IMAGE_PROCESSOR_MAPPING_NAMES.get( + self.lowercase_name, (None, None) + ) + self.video_processor_class = VIDEO_PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None) + self.feature_extractor_class = FEATURE_EXTRACTOR_MAPPING_NAMES.get(self.lowercase_name, None) + self.processor_class = PROCESSOR_MAPPING_NAMES.get(self.lowercase_name, None) + + +def add_content_to_file(file_name: Union[str, os.PathLike], new_content: str, add_after: str): + """ + A utility to add some content inside a given file. + + Args: + file_name (`str` or `os.PathLike`): + The name of the file in which we want to insert some content. + new_content (`str`): + The content to add. + add_after (`str`): + The new content is added just after the first instance matching it. + """ + with open(file_name, "r", encoding="utf-8") as f: + old_content = f.read() + + before, after = old_content.split(add_after, 1) + new_content = before + add_after + new_content + after + + with open(file_name, "w", encoding="utf-8") as f: + f.write(new_content) + + +def add_model_to_auto_mappings( + old_model_infos: ModelInfos, + new_lowercase_name: str, + new_model_paper_name: str, + filenames_to_add: list[tuple[str, bool]], +): + """ + Add a model to all the relevant mappings in the auto module. + + Args: + old_model_infos (`ModelInfos`): + The structure containing the class information of the old model. + new_lowercase_name (`str`): + The new lowercase model name. + new_model_paper_name (`str`): + The fully cased name (as in the official paper name) of the new model. + filenames_to_add (`list[tuple[str, bool]]`): + A list of tuples of all potential filenames to add for a new model, along a boolean flag describing if we + should add this file or not. For example, [(`modeling_xxx.px`, True), (`configuration_xxx.py`, True), (`tokenization_xxx.py`, False),...] 
+ """ + new_cased_name = "".join(x.title() for x in new_lowercase_name.replace("-", "_").split("_")) + old_lowercase_name = old_model_infos.lowercase_name + old_cased_name = old_model_infos.camelcase_name + filenames_to_add = [ + (filename.replace(old_lowercase_name, "auto"), to_add) for filename, to_add in filenames_to_add[1:] + ] + # fast tokenizer/image processor have the same auto mappings as normal ones + corrected_filenames_to_add = [] + for file, to_add in filenames_to_add: + if re.search(r"(?:tokenization)|(?:image_processing)_auto_fast.py", file): + previous_file, previous_to_add = corrected_filenames_to_add[-1] + corrected_filenames_to_add[-1] = (previous_file, previous_to_add or to_add) + else: + corrected_filenames_to_add.append((file, to_add)) + + # Add the config mappings directly as the handling for config is a bit different + add_content_to_file( + TRANSFORMERS_PATH / "models" / "auto" / "configuration_auto.py", + new_content=f' ("{new_lowercase_name}", "{new_cased_name}Config"),\n', + add_after="CONFIG_MAPPING_NAMES = OrderedDict[str, str](\n [\n # Add configs here\n", + ) + add_content_to_file( + TRANSFORMERS_PATH / "models" / "auto" / "configuration_auto.py", + new_content=f' ("{new_lowercase_name}", "{new_model_paper_name}"),\n', + add_after="MODEL_NAMES_MAPPING = OrderedDict[str, str](\n [\n # Add full (and cased) model names here\n", + ) + + for filename, to_add in corrected_filenames_to_add: + if to_add: + # The auto mapping + filename = filename.replace("_fast.py", ".py") + with open(TRANSFORMERS_PATH / "models" / "auto" / filename) as f: + file = f.read() + # The regex has to be a bit complex like this as the tokenizer mapping has new lines everywhere + matching_lines = re.findall( + rf'( {{8,12}}\(\s*"{old_lowercase_name}",.*?\),\n)(?: {{4,12}}\(|\])', file, re.DOTALL + ) + for match in matching_lines: + add_content_to_file( + TRANSFORMERS_PATH / "models" / "auto" / filename, + new_content=match.replace(old_lowercase_name, new_lowercase_name).replace( + old_cased_name, new_cased_name + ), + add_after=match, + ) + + +def create_doc_file(new_paper_name: str, public_classes: list[str]): + """ + Create a new doc file to fill for the new model. + + Args: + new_paper_name (`str`): + The fully cased name (as in the official paper name) of the new model. + public_classes (`list[str]`): + A list of all the public classes that the model will have in the library. + """ + added_note = ( + "\n\n⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that " + "may not be rendered properly in your Markdown viewer.\n\n-->\n\n" + ) + copyright_for_markdown = re.sub(r"# ?", "", COPYRIGHT).replace("coding=utf-8\n", " [{"score":x, ...}, ...] + keys = ["score", "label", "box"] + annotation = [ + dict(zip(keys, vals)) + for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"]) + ] + + return annotation + + def _get_bounding_box(self, box: "torch.Tensor") -> dict[str, int]: + """ + Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... } + + Args: + box (`torch.Tensor`): Tensor containing the coordinates in corners format. + + Returns: + bbox (`dict[str, int]`): Dict containing the coordinates in corners format. 
+ """ + if self.framework != "pt": + raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.") + xmin, ymin, xmax, ymax = box.int().tolist() + bbox = { + "xmin": xmin, + "ymin": ymin, + "xmax": xmax, + "ymax": ymax, + } + return bbox diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/pt_utils.py b/venv/lib/python3.13/site-packages/transformers/pipelines/pt_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3857805962a93f796e407ed88c89495b46fd0bd0 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/pt_utils.py @@ -0,0 +1,323 @@ +import numpy as np +import torch +from torch.utils.data import Dataset, IterableDataset + +from ..utils.generic import ModelOutput + + +class PipelineDataset(Dataset): + def __init__(self, dataset, process, params): + self.dataset = dataset + self.process = process + self.params = params + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + item = self.dataset[i] + processed = self.process(item, **self.params) + return processed + + +class PipelineIterator(IterableDataset): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + ``` + for item in loader: + yield infer(item, **params) + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + loader_batch_size (`int`, *optional*): + If specified, the items of `loader` are supposed to come as batch, and are loader_batched here + making it roughly behave as + + + ``` + for items in loader: + for i in loader_batch_size: + item = items[i] + yield infer(item, **params) + ```""" + self.loader = loader + self.infer = infer + self.params = params + if loader_batch_size == 1: + # Let's spare some time by deactivating altogether + loader_batch_size = None + self.loader_batch_size = loader_batch_size + + # Internal bookkeeping + self._loader_batch_index = None + self._loader_batch_data = None + + def __len__(self): + return len(self.loader) + + def __iter__(self): + self.iterator = iter(self.loader) + return self + + def loader_batch_item(self): + """ + Return item located at `loader_batch_index` within the current `loader_batch_data`. + """ + if isinstance(self._loader_batch_data, torch.Tensor): + # Batch data is simple tensor, just fetch the slice + result = self._loader_batch_data[self._loader_batch_index].unsqueeze(0) + else: + # Batch data is assumed to be BaseModelOutput (or dict) + loader_batched = {} + for k, element in self._loader_batch_data.items(): + if isinstance(element, ModelOutput): + # Convert ModelOutput to tuple first + element = element.to_tuple() + if isinstance(element[0], torch.Tensor): + loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element) + elif isinstance(element[0], np.ndarray): + loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element) + continue + if k in {"hidden_states", "attentions"} and isinstance(element, tuple): + # Those are stored as lists of tensors so need specific unbatching. 
+ if isinstance(element[0], torch.Tensor): + loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element) + elif isinstance(element[0], np.ndarray): + loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element) + continue + if k == "past_key_values": + continue + if element is None: + # This can happen for optional data that get passed around + loader_batched[k] = None + elif isinstance(element[self._loader_batch_index], torch.Tensor): + # Take correct batch data, but make it looked like batch_size=1 + # For compatibility with other methods within transformers + + loader_batched[k] = element[self._loader_batch_index].unsqueeze(0) + elif isinstance(element[self._loader_batch_index], np.ndarray): + # Take correct batch data, but make it looked like batch_size=1 + # For compatibility with other methods within transformers + loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0) + else: + # This is typically a list, so no need to `unsqueeze`. + loader_batched[k] = element[self._loader_batch_index] + # Recreate the element by reusing the original class to make it look + # batch_size=1 + result = self._loader_batch_data.__class__(loader_batched) + self._loader_batch_index += 1 + return result + + def __next__(self): + if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size: + # We are currently unrolling a batch so we just need to return + # the current item within a batch + return self.loader_batch_item() + + # We're out of items within a batch + item = next(self.iterator) + processed = self.infer(item, **self.params) + # We now have a batch of "inferred things". + if self.loader_batch_size is not None: + # Try to infer the size of the batch + if isinstance(processed, torch.Tensor): + first_tensor = processed + elif isinstance(processed, tuple): + first_tensor = processed[0] + else: + key = list(processed.keys())[0] + first_tensor = processed[key] + + if isinstance(first_tensor, list): + observed_batch_size = len(first_tensor) + else: + observed_batch_size = first_tensor.shape[0] + if 0 < observed_batch_size < self.loader_batch_size: + # could be last batch so we can't unroll as many + # elements. + self.loader_batch_size = observed_batch_size + # Setting internal index to unwrap the batch + self._loader_batch_data = processed[0] if isinstance(processed, tuple) else processed + self._loader_batch_index = 0 + return self.loader_batch_item() + else: + # We're not unrolling batches + return processed + + +class PipelineChunkIterator(PipelineIterator): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + ``` + for iterator in loader: + for item in iterator: + yield infer(item, **params) + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + """ + super().__init__(loader, infer, params) + + def __iter__(self): + self.iterator = iter(self.loader) + self.subiterator = None + return self + + def __next__(self): + if self.subiterator is None: + "Subiterator None means we haven't started a `preprocess` iterator. 
so start it" + self.subiterator = self.infer(next(self.iterator), **self.params) + try: + # Try to return next item + processed = next(self.subiterator) + except StopIteration: + # When a preprocess iterator ends, we can start looking at the next item + # ChunkIterator will keep feeding until ALL elements of iterator + # all have created their subiterator and have been iterating against. + # + # Another way to look at it, is we're basically flattening lists of lists + # into a single list, but with generators + self.subiterator = self.infer(next(self.iterator), **self.params) + processed = next(self.subiterator) + return processed + + +class PipelinePackIterator(PipelineIterator): + """ + Roughly equivalent to + + ``` + packed = [] + for item in loader: + packed.append(item) + if item["is_last"]: + yield packed + packed = [] + ``` + + but it also handles cases where `item` are batched (meaning it's a dict of Tensor with first dimension > 1. In + that case it does + + ``` + packed = [] + for batch in loader: + # item is batched + for item in batch: + packed.append(item) + if item["is_last"]: + yield packed + packed = [] + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + loader_batch_size (`int`, *optional*): + If specified, the items of `loader` are supposed to come as batch, and are loader_batched here making + it roughly behave as + + + ``` + for items in loader: + for i in loader_batch_size: + item = items[i] + yield infer(item, **params) + ```""" + + def __iter__(self): + self.iterator = iter(self.loader) + return self + + def __next__(self): + # Extremely similar to PipelineIterator in its unpacking mechanism + # BUT, we have an extra required item which is the presence of `is_last` + # That is because everything is flattened by `PipelineChunkIterator` we + # need to keep track of how to regroup here in the original `process` + # boundaries so that `process` and `postprocess` see the same data. + + # This iterator accumulates items (possibly while unbatching) until it + # its a `is_last` and then just passes it on to the caller. + is_last = False + accumulator = [] + if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size: + while self._loader_batch_index < self.loader_batch_size: + item = self.loader_batch_item() + is_last = item.pop("is_last") + accumulator.append(item) + if is_last: + return accumulator + + while not is_last: + processed = self.infer(next(self.iterator), **self.params) + if self.loader_batch_size is not None: + if isinstance(processed, torch.Tensor): + first_tensor = processed + else: + key = list(processed.keys())[0] + first_tensor = processed[key] + if isinstance(first_tensor, list): + observed_batch_size = len(first_tensor) + else: + observed_batch_size = first_tensor.shape[0] + if 0 < observed_batch_size < self.loader_batch_size: + # could be last batch so we can't unroll as many + # elements. 
+ self.loader_batch_size = observed_batch_size + self._loader_batch_data = processed + self._loader_batch_index = 0 + while self._loader_batch_index < self.loader_batch_size: + item = self.loader_batch_item() + is_last = item.pop("is_last") + accumulator.append(item) + if is_last: + return accumulator + else: + item = processed + is_last = item.pop("is_last") + accumulator.append(item) + return accumulator + + +class KeyDataset(Dataset): + def __init__(self, dataset: Dataset, key: str): + self.dataset = dataset + self.key = key + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + return self.dataset[i][self.key] + + +class KeyPairDataset(Dataset): + def __init__(self, dataset: Dataset, key1: str, key2: str): + self.dataset = dataset + self.key1 = key1 + self.key2 = key2 + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + return {"text": self.dataset[i][self.key1], "text_pair": self.dataset[i][self.key2]} diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/question_answering.py b/venv/lib/python3.13/site-packages/transformers/pipelines/question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..1958fbd1fcc8420b1f45508a5fbe5e5c8949d4ac --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/question_answering.py @@ -0,0 +1,707 @@ +import inspect +import types +import warnings +from collections.abc import Iterable +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features +from ..modelcard import ModelCard +from ..tokenization_utils import PreTrainedTokenizer +from ..utils import ( + PaddingStrategy, + add_end_docstrings, + is_tf_available, + is_tokenizers_available, + is_torch_available, + logging, +) +from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + +if TYPE_CHECKING: + from ..modeling_tf_utils import TFPreTrainedModel + from ..modeling_utils import PreTrainedModel + + if is_tokenizers_available(): + import tokenizers + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + + Dataset = None + +if is_torch_available(): + import torch + from torch.utils.data import Dataset + + from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + + +def decode_spans( + start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray +) -> tuple: + """ + Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the actual + answer. + + In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or + answer end position being before the starting position. The method supports output the k-best answer through the + topk argument. + + Args: + start (`np.ndarray`): Individual start probabilities for each token. + end (`np.ndarray`): Individual end probabilities for each token. + topk (`int`): Indicates how many possible answer span(s) to extract from the model output. + max_answer_len (`int`): Maximum size of the answer to extract from the model's output. 
+ undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer + """ + # Ensure we have batch axis + if start.ndim == 1: + start = start[None] + + if end.ndim == 1: + end = end[None] + + # Compute the score of each tuple(start, end) to be the real answer + outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) + + # Remove candidate with end < start and end - start > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) + + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:] + desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero()) + starts = starts[desired_spans] + ends = ends[desired_spans] + scores = candidates[0, starts, ends] + + return starts, ends, scores + + +def select_starts_ends( + start, + end, + p_mask, + attention_mask, + min_null_score=1000000, + top_k=1, + handle_impossible_answer=False, + max_answer_len=15, +): + """ + Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses + `decode_spans()` to generate probabilities for each span to be the actual answer. + + Args: + start (`np.ndarray`): Individual start logits for each token. + end (`np.ndarray`): Individual end logits for each token. + p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer + attention_mask (`np.ndarray`): The attention mask generated by the tokenizer + min_null_score(`float`): The minimum null (empty) answer score seen so far. + topk (`int`): Indicates how many possible answer span(s) to extract from the model output. + handle_impossible_answer(`bool`): Whether to allow null (empty) answers + max_answer_len (`int`): Maximum size of the answer to extract from the model's output. + """ + # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. + undesired_tokens = np.abs(np.array(p_mask) - 1) + + if attention_mask is not None: + undesired_tokens = undesired_tokens & attention_mask + + # Generate mask + undesired_tokens_mask = undesired_tokens == 0.0 + + # Make sure non-context indexes in the tensor cannot contribute to the softmax + start = np.where(undesired_tokens_mask, -10000.0, start) + end = np.where(undesired_tokens_mask, -10000.0, end) + + # Normalize logits and spans to retrieve the answer + start = np.exp(start - start.max(axis=-1, keepdims=True)) + start = start / start.sum() + + end = np.exp(end - end.max(axis=-1, keepdims=True)) + end = end / end.sum() + + if handle_impossible_answer: + min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item()) + + # Mask CLS + start[0, 0] = end[0, 0] = 0.0 + + starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, undesired_tokens) + return starts, ends, scores, min_null_score + + +class QuestionAnsweringArgumentHandler(ArgumentHandler): + """ + QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to + internal [`SquadExample`]. + + QuestionAnsweringArgumentHandler manages all the possible to create a [`SquadExample`] from the command-line + supplied arguments. 
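A small NumPy sketch of `decode_spans`, assuming uniform start/end probabilities over four tokens and no masked positions.

```python
import numpy as np

from transformers.pipelines.question_answering import decode_spans

seq_len = 4
start = np.full((1, seq_len), 0.25)                   # per-token start probabilities
end = np.full((1, seq_len), 0.25)                     # per-token end probabilities
undesired_tokens = np.ones(seq_len, dtype=np.int64)   # 1 = token may belong to the answer

starts, ends, scores = decode_spans(
    start, end, topk=1, max_answer_len=2, undesired_tokens=undesired_tokens
)
print(starts, ends, scores)  # highest-scoring span with end >= start and length <= 2
```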
+ """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + def normalize(self, item): + if isinstance(item, SquadExample): + return item + elif isinstance(item, dict): + for k in ["question", "context"]: + if k not in item: + raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") + elif item[k] is None: + raise ValueError(f"`{k}` cannot be None") + elif isinstance(item[k], str) and len(item[k]) == 0: + raise ValueError(f"`{k}` cannot be empty") + + return QuestionAnsweringPipeline.create_sample(**item) + raise ValueError(f"{item} argument needs to be of type (SquadExample, dict)") + + def __call__(self, *args, **kwargs): + # Detect where the actual inputs are + if args is not None and len(args) > 0: + if len(args) == 1: + inputs = args[0] + elif len(args) == 2 and {type(el) for el in args} == {str}: + inputs = [{"question": args[0], "context": args[1]}] + else: + inputs = list(args) + # Generic compatibility with sklearn and Keras + # Batched data + elif "X" in kwargs: + warnings.warn( + "Passing the `X` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + inputs = kwargs["X"] + elif "data" in kwargs: + warnings.warn( + "Passing the `data` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + inputs = kwargs["data"] + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str): + inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]] + elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list): + if len(kwargs["question"]) != len(kwargs["context"]): + raise ValueError("Questions and contexts don't have the same lengths") + + inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])] + elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str): + inputs = [{"question": kwargs["question"], "context": kwargs["context"]}] + else: + raise ValueError("Arguments can't be understood") + else: + raise ValueError(f"Unknown arguments {kwargs}") + + # When user is sending a generator we need to trust it's a valid example + generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,) + if isinstance(inputs, generator_types): + return inputs + + # Normalize inputs + if isinstance(inputs, dict): + inputs = [inputs] + elif isinstance(inputs, Iterable): + # Copy to avoid overriding arguments + inputs = list(inputs) + else: + raise ValueError(f"Invalid arguments {kwargs}") + + for i, item in enumerate(inputs): + inputs[i] = self.normalize(item) + + return inputs + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class QuestionAnsweringPipeline(ChunkPipeline): + """ + Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering + examples](../task_summary#question-answering) for more information. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="deepset/roberta-base-squad2") + >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin") + {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=question-answering). + """ + + default_input_names = "question,context" + handle_impossible_answer = False + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + task: str = "", + **kwargs, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + task=task, + **kwargs, + ) + + self._args_parser = QuestionAnsweringArgumentHandler() + self.check_model_type( + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + ) + + @staticmethod + def create_sample( + question: Union[str, list[str]], context: Union[str, list[str]] + ) -> Union[SquadExample, list[SquadExample]]: + """ + QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulate all the + logic for converting question(s) and context(s) to [`SquadExample`]. + + We currently support extractive question answering. + + Arguments: + question (`str` or `list[str]`): The question(s) asked. + context (`str` or `list[str]`): The context(s) in which we will look for the answer. + + Returns: + One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context. 
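For instance, the helper can be exercised on its own, since it only wraps the inputs into `SquadExample` objects and needs no model.

```python
from transformers import QuestionAnsweringPipeline

single = QuestionAnsweringPipeline.create_sample(
    question="Where do I live?",
    context="My name is Wolfgang and I live in Berlin",
)
print(type(single).__name__)  # SquadExample

batch = QuestionAnsweringPipeline.create_sample(
    question=["Where do I live?", "What is my name?"],
    context=["My name is Wolfgang and I live in Berlin"] * 2,
)
print(len(batch))  # 2
```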
+ """ + if isinstance(question, list): + return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] + else: + return SquadExample(None, question, context, None, None, None) + + def _sanitize_parameters( + self, + padding=None, + topk=None, + top_k=None, + doc_stride=None, + max_answer_len=None, + max_seq_len=None, + max_question_len=None, + handle_impossible_answer=None, + align_to_words=None, + **kwargs, + ): + # Set defaults values + preprocess_params = {} + if padding is not None: + preprocess_params["padding"] = padding + if doc_stride is not None: + preprocess_params["doc_stride"] = doc_stride + if max_question_len is not None: + preprocess_params["max_question_len"] = max_question_len + if max_seq_len is not None: + preprocess_params["max_seq_len"] = max_seq_len + + postprocess_params = {} + if topk is not None and top_k is None: + warnings.warn("topk parameter is deprecated, use top_k instead", UserWarning) + top_k = topk + if top_k is not None: + if top_k < 1: + raise ValueError(f"top_k parameter should be >= 1 (got {top_k})") + postprocess_params["top_k"] = top_k + if max_answer_len is not None: + if max_answer_len < 1: + raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}") + postprocess_params["max_answer_len"] = max_answer_len + if handle_impossible_answer is not None: + postprocess_params["handle_impossible_answer"] = handle_impossible_answer + if align_to_words is not None: + postprocess_params["align_to_words"] = align_to_words + return preprocess_params, {}, postprocess_params + + def __call__(self, *args, **kwargs): + """ + Answer the question(s) given as inputs by using the context(s). + + Args: + question (`str` or `list[str]`): + One or several question(s) (must be used in conjunction with the `context` argument). + context (`str` or `list[str]`): + One or several context(s) associated with the question(s) (must be used in conjunction with the + `question` argument). + top_k (`int`, *optional*, defaults to 1): + The number of answers to return (will be chosen by order of likelihood). Note that we return less than + top_k answers if there are not enough options available within the context. + doc_stride (`int`, *optional*, defaults to 128): + If the context is too long to fit with the question for the model, it will be split in several chunks + with some overlap. This argument controls the size of that overlap. + max_answer_len (`int`, *optional*, defaults to 15): + The maximum length of predicted answers (e.g., only answers with a shorter length are considered). + max_seq_len (`int`, *optional*, defaults to 384): + The maximum length of the total sentence (context + question) in tokens of each chunk passed to the + model. The context will be split in several chunks (using `doc_stride` as overlap) if needed. + max_question_len (`int`, *optional*, defaults to 64): + The maximum length of the question after tokenization. It will be truncated if needed. + handle_impossible_answer (`bool`, *optional*, defaults to `False`): + Whether or not we accept impossible as an answer. + align_to_words (`bool`, *optional*, defaults to `True`): + Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on + non-space-separated languages (like Japanese or Chinese) + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **score** (`float`) -- The probability associated to the answer. 
+ - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input). + - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input). + - **answer** (`str`) -- The answer to the question. + """ + + # Convert inputs to features + if args: + warnings.warn( + "Passing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + + examples = self._args_parser(*args, **kwargs) + if isinstance(examples, (list, tuple)) and len(examples) == 1: + return super().__call__(examples[0], **kwargs) + return super().__call__(examples, **kwargs) + + def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None): + # XXX: This is special, args_parser will not handle anything generator or dataset like + # For those we expect user to send a simple valid example either directly as a SquadExample or simple dict. + # So we still need a little sanitation here. + if isinstance(example, dict): + example = SquadExample(None, example["question"], example["context"], None, None, None) + + if max_seq_len is None: + max_seq_len = min(self.tokenizer.model_max_length, 384) + if doc_stride is None: + doc_stride = min(max_seq_len // 2, 128) + + if doc_stride > max_seq_len: + raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})") + + if not self.tokenizer.is_fast: + features = squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=max_seq_len, + doc_stride=doc_stride, + max_query_length=max_question_len, + padding_strategy=PaddingStrategy.MAX_LENGTH, + is_training=False, + tqdm_enabled=False, + ) + else: + # Define the side we want to truncate / pad and the text/pair sorting + question_first = self.tokenizer.padding_side == "right" + + encoded_inputs = self.tokenizer( + text=example.question_text if question_first else example.context_text, + text_pair=example.context_text if question_first else example.question_text, + padding=padding, + truncation="only_second" if question_first else "only_first", + max_length=max_seq_len, + stride=doc_stride, + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + # When the input is too long, it's converted in a batch of inputs with overflowing tokens + # and a stride of overlap between the inputs. If a batch of inputs is given, a special output + # "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample. + # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping". + # "num_span" is the number of output samples generated from the overflowing tokens. 
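The overflow behaviour described in the comment above can be sketched with any fast tokenizer; the checkpoint below is the one from the pipeline's docstring example and is only illustrative.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

encoded_inputs = tokenizer(
    text="Where do I live?",
    text_pair="My name is Wolfgang and I live in Berlin. " * 50,  # deliberately long context
    truncation="only_second",
    max_length=64,
    stride=32,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    return_special_tokens_mask=True,
)
# One example is split into several overlapping spans, one entry per chunk.
print(len(encoded_inputs["input_ids"]))
```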
+ num_spans = len(encoded_inputs["input_ids"]) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) + p_mask = [ + [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans) + ] + + features = [] + for span_idx in range(num_spans): + input_ids_span_idx = encoded_inputs["input_ids"][span_idx] + attention_mask_span_idx = ( + encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None + ) + token_type_ids_span_idx = ( + encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None + ) + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) + if self.tokenizer.cls_token_id is not None: + cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0] + for cls_index in cls_indices: + p_mask[span_idx][cls_index] = 0 + submask = p_mask[span_idx] + features.append( + SquadFeatures( + input_ids=input_ids_span_idx, + attention_mask=attention_mask_span_idx, + token_type_ids=token_type_ids_span_idx, + p_mask=submask, + encoding=encoded_inputs[span_idx], + # We don't use the rest of the values - and actually + # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample + cls_index=None, + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + ) + + for i, feature in enumerate(features): + fw_args = {} + others = {} + model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"] + + for k, v in feature.__dict__.items(): + if k in model_input_names: + if self.framework == "tf": + tensor = tf.constant(v) + if tensor.dtype == tf.int64: + tensor = tf.cast(tensor, tf.int32) + fw_args[k] = tf.expand_dims(tensor, 0) + elif self.framework == "pt": + tensor = torch.tensor(v) + if tensor.dtype == torch.int32: + tensor = tensor.long() + fw_args[k] = tensor.unsqueeze(0) + else: + others[k] = v + + is_last = i == len(features) - 1 + yield {"example": example, "is_last": is_last, **fw_args, **others} + + def _forward(self, inputs): + example = inputs["example"] + model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names} + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters: + model_inputs["use_cache"] = False + output = self.model(**model_inputs) + if isinstance(output, dict): + return {"start": output["start_logits"], "end": output["end_logits"], "example": example, **inputs} + else: + start, end = output[:2] + return {"start": start, "end": end, "example": example, **inputs} + + def postprocess( + self, + model_outputs, + top_k=1, + handle_impossible_answer=False, + max_answer_len=15, + align_to_words=True, + ): + min_null_score = 1000000 # large and positive + answers = [] + for output in model_outputs: + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + start_ = output["start"].to(torch.float32) + end_ = output["end"].to(torch.float32) + else: + start_ = output["start"] + end_ = output["end"] + example = output["example"] + p_mask = output["p_mask"] + attention_mask = ( + 
output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None + ) + + pre_topk = ( + top_k * 2 + 10 if align_to_words else top_k + ) # Some candidates may be deleted if we align to words + starts, ends, scores, min_null_score = select_starts_ends( + start_, + end_, + p_mask, + attention_mask, + min_null_score, + pre_topk, + handle_impossible_answer, + max_answer_len, + ) + + if not self.tokenizer.is_fast: + char_to_word = np.array(example.char_to_word_offset) + + # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + for s, e, score in zip(starts, ends, scores): + token_to_orig_map = output["token_to_orig_map"] + answers.append( + { + "score": score.item(), + "start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(), + "answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]), + } + ) + else: + # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + question_first = self.tokenizer.padding_side == "right" + enc = output["encoding"] + + # Encoding was *not* padded, input_ids *might*. + # It doesn't make a difference unless we're padding on + # the left hand side, since now we have different offsets + # everywhere. + if self.tokenizer.padding_side == "left": + offset = (output["input_ids"] == self.tokenizer.pad_token_id).numpy().sum() + else: + offset = 0 + + # Sometimes the max probability token is in the middle of a word so: + # - we start by finding the right word containing the token with `token_to_word` + # - then we convert this word in a character span with `word_to_chars` + sequence_index = 1 if question_first else 0 + + for s, e, score in zip(starts, ends, scores): + s = s - offset + e = e - offset + + start_index, end_index = self.get_indices(enc, s, e, sequence_index, align_to_words) + + target_answer = example.context_text[start_index:end_index] + answer = self.get_answer(answers, target_answer) + + if answer: + answer["score"] += score.item() + else: + answers.append( + { + "score": score.item(), + "start": start_index, + "end": end_index, + "answer": example.context_text[start_index:end_index], + } + ) + + if handle_impossible_answer: + answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k] + if len(answers) == 1: + return answers[0] + return answers + + def get_answer(self, answers: list[dict], target: str) -> Optional[dict]: + for answer in answers: + if answer["answer"].lower() == target.lower(): + return answer + return None + + def get_indices( + self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool + ) -> tuple[int, int]: + if align_to_words: + try: + start_word = enc.token_to_word(s) + end_word = enc.token_to_word(e) + start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0] + end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1] + except Exception: + # Some tokenizers don't 
really handle words. Keep to offsets then. + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + else: + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + return start_index, end_index + + def span_to_answer(self, text: str, start: int, end: int) -> dict[str, Union[str, int]]: + """ + When decoding from token probabilities, this method maps token indexes to actual word in the initial context. + + Args: + text (`str`): The actual context to extract the answer from. + start (`int`): The answer starting token index. + end (`int`): The answer end token index. + + Returns: + Dictionary like `{'answer': str, 'start': int, 'end': int}` + """ + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 + + for word in text.split(" "): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + chars_idx += len(word) + 1 + + # Join text with spaces + return { + "answer": " ".join(words), + "start": max(0, char_start_idx), + "end": min(len(text), char_end_idx), + } diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/table_question_answering.py b/venv/lib/python3.13/site-packages/transformers/pipelines/table_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..4eba8ad64cf2e756f084a7726c89de1df3ce5cec --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/table_question_answering.py @@ -0,0 +1,457 @@ +import collections +import types + +import numpy as np + +from ..generation import GenerationConfig +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + requires_backends, +) +from .base import ArgumentHandler, Dataset, Pipeline, PipelineException, build_pipeline_init_args + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import ( + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES, + ) + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import ( + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES, + ) + + +class TableQuestionAnsweringArgumentHandler(ArgumentHandler): + """ + Handles arguments for the TableQuestionAnsweringPipeline + """ + + def __call__(self, table=None, query=None, **kwargs): + # Returns tqa_pipeline_inputs of shape: + # [ + # {"table": pd.DataFrame, "query": list[str]}, + # ..., + # {"table": pd.DataFrame, "query" : list[str]} + # ] + requires_backends(self, "pandas") + import pandas as pd + + if table is None: + raise ValueError("Keyword argument `table` cannot be None.") + elif query is None: + if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None: + tqa_pipeline_inputs = [table] + elif isinstance(table, list) and len(table) > 0: + if not all(isinstance(d, dict) for d in table): + raise ValueError( + f"Keyword argument `table` should be a list of dict, but is {(type(d) for d in table)}" + ) + + if table[0].get("query") is not None and table[0].get("table") is not None: + tqa_pipeline_inputs = table + else: + raise ValueError( + "If keyword argument `table` is a list 
of dictionaries, each dictionary should have a `table`" + f" and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys." + ) + elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType): + return table + else: + raise ValueError( + "Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but " + f"is {type(table)})" + ) + else: + tqa_pipeline_inputs = [{"table": table, "query": query}] + + for tqa_pipeline_input in tqa_pipeline_inputs: + if not isinstance(tqa_pipeline_input["table"], pd.DataFrame): + if tqa_pipeline_input["table"] is None: + raise ValueError("Table cannot be None.") + + tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"]) + + return tqa_pipeline_inputs + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TableQuestionAnsweringPipeline(Pipeline): + """ + Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in + PyTorch. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq") + >>> table = { + ... "Repository": ["Transformers", "Datasets", "Tokenizers"], + ... "Stars": ["36542", "4512", "3934"], + ... "Contributors": ["651", "77", "34"], + ... "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + ... } + >>> oracle(query="How many stars does the transformers repository have?", table=table) + {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"table-question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task. + See the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering). 
+ """ + + default_input_names = "table,query" + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), **kwargs): + super().__init__(**kwargs) + self._args_parser = args_parser + + if self.framework == "tf": + mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy() + mapping.update(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES) + else: + mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy() + mapping.update(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES) + self.check_model_type(mapping) + + self.aggregate = getattr(self.model.config, "aggregation_labels", None) and getattr( + self.model.config, "num_aggregation_labels", None + ) + self.type = "tapas" if hasattr(self.model.config, "aggregation_labels") else None + + def batch_inference(self, **inputs): + return self.model(**inputs) + + def sequential_inference(self, **inputs): + """ + Inference used for models that need to process sequences in a sequential fashion, like the SQA models which + handle conversational query related to a table. + """ + if self.framework == "pt": + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + token_type_ids = inputs["token_type_ids"].to(self.device) + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. 
+ if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device) + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=input_ids_example.unsqueeze(0), + attention_mask=attention_mask_example.unsqueeze(0), + token_type_ids=token_type_ids_example.unsqueeze(0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + dist_per_token = torch.distributions.Bernoulli(logits=logits) + probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to( + dist_per_token.probs.device + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(probabilities.squeeze().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = torch.cat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0)) + else: + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + token_type_ids = inputs["token_type_ids"].numpy() + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. 
+ if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = model_labels + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=np.expand_dims(input_ids_example, axis=0), + attention_mask=np.expand_dims(attention_mask_example, axis=0), + token_type_ids=np.expand_dims(token_type_ids_example, axis=0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + probabilities = tf.math.sigmoid(tf.cast(logits, tf.float32)) * tf.cast( + attention_mask_example, tf.float32 + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = tf.concat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0)) + + def __call__(self, *args, **kwargs): + r""" + Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below: + + - `pipeline(table, query)` + - `pipeline(table, [query])` + - `pipeline(table=table, query=query)` + - `pipeline(table=table, query=[query])` + - `pipeline({"table": table, "query": query})` + - `pipeline({"table": table, "query": [query]})` + - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])` + + The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table: + + Example: + + ```python + data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + ``` + + This dictionary can be passed in as such, or can be converted to a pandas DataFrame: + + Example: + + ```python + import pandas as pd + + table = pd.DataFrame.from_dict(data) + ``` + + Args: + table (`pd.DataFrame` or `Dict`): + Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values. + See above for an example of dictionary. + query (`str` or `list[str]`): + Query or list of queries that will be sent to the model alongside the table. + sequential (`bool`, *optional*, defaults to `False`): + Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the + inference to be done sequentially to extract relations within sequences, given their conversational + nature. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + + truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` + or to the maximum acceptable input length for the model if that argument is not provided. This will + truncate row by row, removing rows from the table. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + + + Return: + A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following + keys: + + - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will + be preceded by `AGGREGATOR >`. + - **coordinates** (`list[tuple[int, int]]`) -- Coordinates of the cells of the answers. + - **cells** (`list[str]`) -- List of strings made up of the answer cell values. + - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator. + """ + pipeline_inputs = self._args_parser(*args, **kwargs) + + results = super().__call__(pipeline_inputs, **kwargs) + if len(results) == 1: + return results[0] + return results + + def _sanitize_parameters(self, sequential=None, padding=None, truncation=None, **kwargs): + preprocess_params = {} + if padding is not None: + preprocess_params["padding"] = padding + if truncation is not None: + preprocess_params["truncation"] = truncation + + forward_params = {} + if sequential is not None: + forward_params["sequential"] = sequential + + if getattr(self, "assistant_model", None) is not None: + forward_params["assistant_model"] = self.assistant_model + if getattr(self, "assistant_tokenizer", None) is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, {} + + def preprocess(self, pipeline_input, padding=True, truncation=None): + if truncation is None: + if self.type == "tapas": + truncation = "drop_rows_to_fit" + else: + truncation = "do_not_truncate" + + table, query = pipeline_input["table"], pipeline_input["query"] + if table.empty: + raise ValueError("table is empty") + if query is None or query == "": + raise ValueError("query is empty") + inputs = self.tokenizer(table, query, return_tensors=self.framework, truncation=truncation, padding=padding) + inputs["table"] = table + return inputs + + def _forward(self, model_inputs, sequential=False, **generate_kwargs): + table = model_inputs.pop("table") + + if self.type == "tapas": + if sequential: + outputs = self.sequential_inference(**model_inputs) + else: + outputs = self.batch_inference(**model_inputs) + else: + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + 
generate_kwargs["generation_config"] = self.generation_config + + outputs = self.model.generate(**model_inputs, **generate_kwargs) + model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs} + return model_outputs + + def postprocess(self, model_outputs): + inputs = model_outputs["model_inputs"] + table = model_outputs["table"] + outputs = model_outputs["outputs"] + if self.type == "tapas": + if self.aggregate: + logits, logits_agg = outputs[:2] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg) + answer_coordinates_batch, agg_predictions = predictions + aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)} + + no_agg_label_index = self.model.config.no_aggregation_label_index + aggregators_prefix = { + i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index + } + else: + logits = outputs[0] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits) + answer_coordinates_batch = predictions[0] + aggregators = {} + aggregators_prefix = {} + answers = [] + for index, coordinates in enumerate(answer_coordinates_batch): + cells = [table.iat[coordinate] for coordinate in coordinates] + aggregator = aggregators.get(index, "") + aggregator_prefix = aggregators_prefix.get(index, "") + answer = { + "answer": aggregator_prefix + ", ".join(cells), + "coordinates": coordinates, + "cells": [table.iat[coordinate] for coordinate in coordinates], + } + if aggregator: + answer["aggregator"] = aggregator + + answers.append(answer) + if len(answer) == 0: + raise PipelineException("Table question answering", self.model.name_or_path, "Empty answer") + else: + answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)] + + return answers if len(answers) > 1 else answers[0] diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text2text_generation.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text2text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..8952b58208677ec1984b163ff2a34318ff6f8267 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text2text_generation.py @@ -0,0 +1,410 @@ +import enum +import warnings +from typing import Any, Union + +from ..generation import GenerationConfig +from ..tokenization_utils import TruncationStrategy +from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +class ReturnType(enum.Enum): + TENSORS = 0 + TEXT = 1 + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class Text2TextGenerationPipeline(Pipeline): + """ + Pipeline for text to text generation using seq2seq models. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - num_beams: 4 + + Example: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap") + >>> generator( + ... 
"answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google" + ... ) + [{'generated_text': 'question: Who created the RuPERTa-base?'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text + generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about + text generation parameters in [Text generation strategies](../generation_strategies) and [Text + generation](text_generation). + + This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"text2text-generation"`. + + The models that this pipeline can use are models that have been fine-tuned on a translation task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). For a list of available + parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Usage: + + ```python + text2text_generator = pipeline("text2text-generation") + text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything") + ```""" + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + # Make sure the docstring is updated when the default generation config is changed (in all pipelines in this file) + _default_generation_config = GenerationConfig( + max_new_tokens=256, + num_beams=4, + ) + + # Used in the return key of the pipeline. + return_name = "generated" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.check_model_type( + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + ) + + def _sanitize_parameters( + self, + return_tensors=None, + return_text=None, + return_type=None, + clean_up_tokenization_spaces=None, + truncation=None, + stop_sequence=None, + **generate_kwargs, + ): + preprocess_params = {} + if truncation is not None: + preprocess_params["truncation"] = truncation + + forward_params = generate_kwargs + + postprocess_params = {} + if return_tensors is not None and return_type is None: + return_type = ReturnType.TENSORS if return_tensors else ReturnType.TEXT + if return_type is not None: + postprocess_params["return_type"] = return_type + + if clean_up_tokenization_spaces is not None: + postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + + if stop_sequence is not None: + stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False) + if len(stop_sequence_ids) > 1: + warnings.warn( + "Stopping on a multiple token sequence is not yet supported on transformers. The first token of" + " the stop sequence will be used as the stop sequence string in the interim." 
+                )
+            generate_kwargs["eos_token_id"] = stop_sequence_ids[0]
+
+        if self.assistant_model is not None:
+            forward_params["assistant_model"] = self.assistant_model
+        if self.assistant_tokenizer is not None:
+            forward_params["tokenizer"] = self.tokenizer
+            forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+        return preprocess_params, forward_params, postprocess_params
+
+    def check_inputs(self, input_length: int, min_length: int, max_length: int):
+        """
+        Checks whether there might be something wrong with the given input with regard to the model.
+        """
+        return True
+
+    def _parse_and_tokenize(self, *args, truncation):
+        prefix = self.prefix if self.prefix is not None else ""
+        if isinstance(args[0], list):
+            if self.tokenizer.pad_token_id is None:
+                raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
+            args = ([prefix + arg for arg in args[0]],)
+            padding = True
+
+        elif isinstance(args[0], str):
+            args = (prefix + args[0],)
+            padding = False
+        else:
+            raise TypeError(
+                f"`args[0]`: {args[0]} has the wrong format. It should be either of type `str` or of type `list`"
+            )
+        inputs = self.tokenizer(*args, padding=padding, truncation=truncation, return_tensors=self.framework)
+        # This is produced by tokenizers but is an invalid `generate` kwarg
+        if "token_type_ids" in inputs:
+            del inputs["token_type_ids"]
+        return inputs
+
+    def __call__(self, *args: Union[str, list[str]], **kwargs: Any) -> list[dict[str, str]]:
+        r"""
+        Generate the output text(s) using text(s) given as inputs.
+
+        Args:
+            args (`str` or `list[str]`):
+                Input text for the encoder.
+            return_tensors (`bool`, *optional*, defaults to `False`):
+                Whether or not to include the tensors of predictions (as token indices) in the outputs.
+            return_text (`bool`, *optional*, defaults to `True`):
+                Whether or not to include the decoded texts in the outputs.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the potential extra spaces in the text output.
+            truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
+                The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE`
+                (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's
+                max_length instead of throwing an error down the line.
+            generate_kwargs:
+                Additional keyword arguments to pass along to the generate method of the model (see the generate method
+                corresponding to your framework [here](./text_generation)).
+
+        Return:
+            A list or a list of lists of `dict`: Each result comes as a dictionary with the following keys:
+
+            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+              ids of the generated text.
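+
+        Example (a minimal sketch: the checkpoint and prompts are illustrative choices, and the exact generated
+        text will vary):
+
+        ```python
+        from transformers import pipeline
+
+        # google-t5/t5-small is just one small seq2seq checkpoint; any text2text-generation model works here
+        generator = pipeline("text2text-generation", model="google-t5/t5-small")
+
+        # A single string returns a list with one dict containing "generated_text"
+        generator("translate English to French: How old are you?")
+
+        # A list of strings is treated as a batch and returns one dict per input
+        generator(
+            [
+                "translate English to German: The weather is nice today.",
+                "translate English to German: I like pizza.",
+            ]
+        )
+        ```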
+ """ + + result = super().__call__(*args, **kwargs) + if ( + isinstance(args[0], list) + and all(isinstance(el, str) for el in args[0]) + and all(len(res) == 1 for res in result) + ): + return [res[0] for res in result] + return result + + def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs): + inputs = self._parse_and_tokenize(inputs, truncation=truncation, **kwargs) + return inputs + + def _forward(self, model_inputs, **generate_kwargs): + if self.framework == "pt": + in_b, input_length = model_inputs["input_ids"].shape + elif self.framework == "tf": + in_b, input_length = tf.shape(model_inputs["input_ids"]).numpy() + + self.check_inputs( + input_length, + generate_kwargs.get("min_length", self.generation_config.min_length), + generate_kwargs.get("max_length", self.generation_config.max_length), + ) + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + output_ids = self.model.generate(**model_inputs, **generate_kwargs) + out_b = output_ids.shape[0] + if self.framework == "pt": + output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:]) + elif self.framework == "tf": + output_ids = tf.reshape(output_ids, (in_b, out_b // in_b, *output_ids.shape[1:])) + return {"output_ids": output_ids} + + def postprocess(self, model_outputs, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False): + records = [] + for output_ids in model_outputs["output_ids"][0]: + if return_type == ReturnType.TENSORS: + record = {f"{self.return_name}_token_ids": output_ids} + elif return_type == ReturnType.TEXT: + record = { + f"{self.return_name}_text": self.tokenizer.decode( + output_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + } + records.append(record) + return records + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class SummarizationPipeline(Text2TextGenerationPipeline): + """ + Summarize news articles and other documents. + + This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"summarization"`. + + The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is + currently, '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date + list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list + of available parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - num_beams: 4 + + Usage: + + ```python + # use bart in pytorch + summarizer = pipeline("summarization") + summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20) + + # use t5 in tf + summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf") + summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20) + ```""" + + # Used in the return key of the pipeline. 
+ return_name = "summary" + + def __call__(self, *args, **kwargs): + r""" + Summarize the text(s) given as inputs. + + Args: + documents (*str* or `list[str]`): + One or several articles (or one list of articles) to summarize. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input. + - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the summary. + """ + return super().__call__(*args, **kwargs) + + def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool: + """ + Checks whether there might be something wrong with given input with regard to the model. + """ + if max_length < min_length: + logger.warning(f"Your min_length={min_length} must be inferior than your max_length={max_length}.") + + if input_length < max_length: + logger.warning( + f"Your max_length is set to {max_length}, but your input_length is only {input_length}. Since this is " + "a summarization task, where outputs shorter than the input are typically wanted, you might " + f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length // 2})" + ) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TranslationPipeline(Text2TextGenerationPipeline): + """ + Translates from one language to another. + + This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"translation_xx_to_yy"`. + + The models that this pipeline can use are models that have been fine-tuned on a translation task. See the + up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation). + For a list of available parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - num_beams: 4 + + Usage: + + ```python + en_fr_translator = pipeline("translation_en_to_fr") + en_fr_translator("How old are you?") + ```""" + + # Used in the return key of the pipeline. + return_name = "translation" + + def check_inputs(self, input_length: int, min_length: int, max_length: int): + if input_length > 0.9 * max_length: + logger.warning( + f"Your input_length: {input_length} is bigger than 0.9 * max_length: {max_length}. You might consider " + "increasing your max_length manually, e.g. 
translator('...', max_length=400)" + ) + return True + + def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None): + if getattr(self.tokenizer, "_build_translation_inputs", None): + return self.tokenizer._build_translation_inputs( + *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang + ) + else: + return super()._parse_and_tokenize(*args, truncation=truncation) + + def _sanitize_parameters(self, src_lang=None, tgt_lang=None, **kwargs): + preprocess_params, forward_params, postprocess_params = super()._sanitize_parameters(**kwargs) + if src_lang is not None: + preprocess_params["src_lang"] = src_lang + if tgt_lang is not None: + preprocess_params["tgt_lang"] = tgt_lang + if src_lang is None and tgt_lang is None: + # Backward compatibility, direct arguments use is preferred. + task = kwargs.get("task", self.task) + items = task.split("_") + if task and len(items) == 4: + # translation, XX, to YY + preprocess_params["src_lang"] = items[1] + preprocess_params["tgt_lang"] = items[3] + return preprocess_params, forward_params, postprocess_params + + def __call__(self, *args, **kwargs): + r""" + Translate the text(s) given as inputs. + + Args: + args (`str` or `list[str]`): + Texts to be translated. + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. + src_lang (`str`, *optional*): + The language of the input. Might be required for multilingual models. Will not have any effect for + single pair translation models + tgt_lang (`str`, *optional*): + The language of the desired output. Might be required for multilingual models. Will not have any effect + for single pair translation models + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **translation_text** (`str`, present when `return_text=True`) -- The translation. + - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The + token ids of the translation. 
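+
+        Example (a minimal sketch; the checkpoints and language codes below are illustrative choices, not a
+        guaranteed or exhaustive list):
+
+        ```python
+        from transformers import pipeline
+
+        # Single-pair model: the languages are fixed by the checkpoint, so no src_lang/tgt_lang is needed
+        en_fr_translator = pipeline("translation_en_to_fr")
+        en_fr_translator("How old are you?")
+
+        # Multilingual model: src_lang and tgt_lang select the language pair
+        # (facebook/mbart-large-50-many-to-many-mmt with its "en_XX"/"fr_XX" codes is one possible choice)
+        translator = pipeline("translation", model="facebook/mbart-large-50-many-to-many-mmt")
+        translator("How old are you?", src_lang="en_XX", tgt_lang="fr_XX")
+        ```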
+ """ + return super().__call__(*args, **kwargs) diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..6f11f3bc97418be6106b183a1925245a273cb085 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text_classification.py @@ -0,0 +1,245 @@ +import inspect +import warnings +from typing import Any, Union + +import numpy as np + +from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available +from .base import GenericTensor, Pipeline, build_pipeline_init_args + + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + + +def sigmoid(_outputs): + return 1.0 / (1.0 + np.exp(-_outputs)) + + +def softmax(_outputs): + maxes = np.max(_outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(_outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class ClassificationFunction(ExplicitEnum): + SIGMOID = "sigmoid" + SOFTMAX = "softmax" + NONE = "none" + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True), + r""" + return_all_scores (`bool`, *optional*, defaults to `False`): + Whether to return all prediction scores or just the one of the predicted class. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different values: + + - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model + has several labels, will apply the softmax function on the output. In case of regression tasks, will not + apply any function on the output. + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output.""", +) +class TextClassificationPipeline(Pipeline): + """ + Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification + examples](../task_summary#sequence-classification) for more information. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") + >>> classifier("This movie is disgustingly good !") + [{'label': 'POSITIVE', 'score': 1.0}] + + >>> classifier("Director tried too much.") + [{'label': 'NEGATIVE', 'score': 0.996}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments). + + If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax + over the results. If there is a single label, the pipeline will run a sigmoid over the result. In case of regression + tasks (`model.config.problem_type == "regression"`), will not apply any function on the output. + + The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. 
See + the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=text-classification). + """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + return_all_scores = False + function_to_apply = ClassificationFunction.NONE + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.check_model_type( + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + ) + + def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs): + # Using "" as default argument because we're going to use `top_k=None` in user code to declare + # "No top_k" + preprocess_params = tokenizer_kwargs + + postprocess_params = {} + if hasattr(self.model.config, "return_all_scores") and return_all_scores is None: + return_all_scores = self.model.config.return_all_scores + + if isinstance(top_k, int) or top_k is None: + postprocess_params["top_k"] = top_k + postprocess_params["_legacy"] = False + elif return_all_scores is not None: + warnings.warn( + "`return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of" + " `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.", + UserWarning, + ) + if return_all_scores: + postprocess_params["top_k"] = None + else: + postprocess_params["top_k"] = 1 + + if isinstance(function_to_apply, str): + function_to_apply = ClassificationFunction[function_to_apply.upper()] + + if function_to_apply is not None: + postprocess_params["function_to_apply"] = function_to_apply + return preprocess_params, {}, postprocess_params + + def __call__( + self, + inputs: Union[str, list[str], dict[str, str], list[dict[str, str]]], + **kwargs: Any, + ) -> list[dict[str, Any]]: + """ + Classify the text(s) given as inputs. + + Args: + inputs (`str` or `list[str]` or `dict[str]`, or `list[dict[str]]`): + One or several texts to classify. In order to use text pairs for your classification, you can send a + dictionary containing `{"text", "text_pair"}` keys, or a list of those. + top_k (`int`, *optional*, defaults to `1`): + How many results to return. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different + values: + + If this argument is not specified, then it will apply the following functions according to the number + of labels: + + - If problem type is regression, will not apply any function on the output. + - If the model has a single label, will apply the sigmoid function on the output. + - If the model has several labels, will apply the softmax function on the output. + + Possible values are: + + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output. + + Return: + A list of `dict`: Each result comes as list of dictionaries with the following keys: + + - **label** (`str`) -- The label predicted. + - **score** (`float`) -- The corresponding probability. + + If `top_k` is used, one such dictionary is returned per label. + """ + inputs = (inputs,) + result = super().__call__(*inputs, **kwargs) + # TODO try and retrieve it in a nicer way from _sanitize_parameters. 
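+        # When the caller never passes `top_k`, we stay in the historical ("legacy") mode handled below: a plain
+        # string input gets wrapped in a one-element list. Passing `top_k` (even `top_k=None`) opts into the
+        # newer behaviour where the result is returned as-is.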
+        _legacy = "top_k" not in kwargs
+        if isinstance(inputs[0], str) and _legacy:
+            # This pipeline is odd and returns a list when a single item is run
+            return [result]
+        else:
+            return result
+
+    def preprocess(self, inputs, **tokenizer_kwargs) -> dict[str, GenericTensor]:
+        return_tensors = self.framework
+        if isinstance(inputs, dict):
+            return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+        elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
+            # It used to be valid to use a list of lists of lists for text pairs; keeping this path for BC
+            return self.tokenizer(
+                text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
+            )
+        elif isinstance(inputs, list):
+            # This is likely an invalid usage of the pipeline attempting to pass text pairs.
+            raise ValueError(
+                "The pipeline received invalid inputs. If you are trying to send text pairs, you can try to send a"
+                ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
+            )
+        return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+
+    def _forward(self, model_inputs):
+        # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
+        model_forward = self.model.forward if self.framework == "pt" else self.model.call
+        if "use_cache" in inspect.signature(model_forward).parameters:
+            model_inputs["use_cache"] = False
+        return self.model(**model_inputs)
+
+    def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
+        # `_legacy` determines whether we're running the naked pipeline in backward-compatibility mode, or
+        # whether `top_k` was passed explicitly (e.g. `pipeline(..., top_k=1)`), in which case we return the
+        # more natural result containing the list.
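+        # If no scoring function was passed explicitly, one is picked from the model config below: regression
+        # heads keep the raw logits, multi-label or single-logit heads get a sigmoid, multi-class heads get a
+        # softmax, and `config.function_to_apply` is used as a last resort.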
+ # Default value before `set_parameters` + if function_to_apply is None: + if self.model.config.problem_type == "regression": + function_to_apply = ClassificationFunction.NONE + elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1: + function_to_apply = ClassificationFunction.SIGMOID + elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1: + function_to_apply = ClassificationFunction.SOFTMAX + elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None: + function_to_apply = self.model.config.function_to_apply + else: + function_to_apply = ClassificationFunction.NONE + + outputs = model_outputs["logits"][0] + + if self.framework == "pt": + # To enable using fp16 and bf16 + outputs = outputs.float().numpy() + else: + outputs = outputs.numpy() + + if function_to_apply == ClassificationFunction.SIGMOID: + scores = sigmoid(outputs) + elif function_to_apply == ClassificationFunction.SOFTMAX: + scores = softmax(outputs) + elif function_to_apply == ClassificationFunction.NONE: + scores = outputs + else: + raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}") + + if top_k == 1 and _legacy: + return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()} + + dict_scores = [ + {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores) + ] + if not _legacy: + dict_scores.sort(key=lambda x: x["score"], reverse=True) + if top_k is not None: + dict_scores = dict_scores[:top_k] + return dict_scores diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text_generation.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..45ec58b702a21fc207a98e75f0b4a89e10e381a0 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text_generation.py @@ -0,0 +1,546 @@ +import enum +import itertools +import types +from typing import Any, overload + +from ..generation import GenerationConfig +from ..utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available +from .base import Pipeline, build_pipeline_init_args + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + from .pt_utils import KeyDataset + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + +ChatType = list[dict[str, str]] + + +class ReturnType(enum.Enum): + TENSORS = 0 + NEW_TEXT = 1 + FULL_TEXT = 2 + + +class Chat: + """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats + to this format because the rest of the pipeline code tends to assume that lists of messages are + actually a batch of samples rather than messages in the same conversation.""" + + def __init__(self, messages: dict): + for message in messages: + if not ("role" in message and "content" in message): + raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.") + self.messages = messages + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TextGenerationPipeline(Pipeline): + """ + Language generation pipeline using any `ModelWithLMHead` or `ModelForCausalLM`. This pipeline predicts the words + that will follow a specified text prompt. 
When the underlying model is a conversational model, it can also accept + one or more chats, in which case the pipeline will operate in chat mode and will continue the chat(s) by adding + its response(s). Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + - do_sample: True + - temperature: 0.7 + + Examples: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="openai-community/gpt2") + >>> generator("I can't believe you did such a ", do_sample=False) + [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}] + + >>> # These parameters will return suggestions, and only the newly created text making it easier for prompting suggestions. + >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False) + ``` + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="HuggingFaceH4/zephyr-7b-beta") + >>> # Zephyr-beta is a conversational model, so let's pass it a chat instead of a single string + >>> generator([{"role": "user", "content": "What is the capital of France? Answer in one word."}], do_sample=False, max_new_tokens=2) + [{'generated_text': [{'role': 'user', 'content': 'What is the capital of France? Answer in one word.'}, {'role': 'assistant', 'content': 'Paris'}]}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text + generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about + text generation parameters in [Text generation strategies](../generation_strategies) and [Text + generation](text_generation). + + This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"text-generation"`. + + The models that this pipeline can use are models that have been trained with an autoregressive language modeling + objective. See the list of available [text completion models](https://huggingface.co/models?filter=text-generation) + and the list of [conversational models](https://huggingface.co/models?other=conversational) + on [huggingface.co/models]. + """ + + # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia + # in https://github.com/rusiaaman/XLNet-gen#methodology + # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e + + XL_PREFIX = """ + In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The + voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western + Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision + and denounces one of the men as a horse thief. Although his father initially slaps him for making such an + accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of + the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, + begging for his blessing. 
+ """ + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + do_sample=True, # free-form text generation often uses sampling + temperature=0.7, + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type( + TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ) + if "prefix" not in self._preprocess_params: + # This is very specific. The logic is quite complex and needs to be done + # as a "default". + # It also defines both some preprocess_kwargs and generate_kwargs + # which is why we cannot put them in their respective methods. + prefix = None + if self.prefix is not None: + prefix = self.prefix + if prefix is None and self.model.__class__.__name__ in [ + "XLNetLMHeadModel", + "TransfoXLLMHeadModel", + "TFXLNetLMHeadModel", + "TFTransfoXLLMHeadModel", + ]: + # For XLNet and TransformerXL we add an article to the prompt to give more state to the model. + prefix = self.XL_PREFIX + if prefix is not None: + # Recalculate some generate_kwargs linked to prefix. + preprocess_params, forward_params, _ = self._sanitize_parameters(prefix=prefix, **self._forward_params) + self._preprocess_params = {**self._preprocess_params, **preprocess_params} + self._forward_params = {**self._forward_params, **forward_params} + + def _sanitize_parameters( + self, + return_full_text=None, + return_tensors=None, + return_text=None, + return_type=None, + clean_up_tokenization_spaces=None, + prefix=None, + handle_long_generation=None, + stop_sequence=None, + truncation=None, + max_length=None, + continue_final_message=None, + skip_special_tokens=None, + tokenizer_encode_kwargs=None, + **generate_kwargs, + ): + # preprocess kwargs + preprocess_params = {} + add_special_tokens = False + if "add_special_tokens" in generate_kwargs: + add_special_tokens = preprocess_params["add_special_tokens"] = generate_kwargs.pop("add_special_tokens") + + if "padding" in generate_kwargs: + preprocess_params["padding"] = generate_kwargs.pop("padding") + + if truncation is not None: + preprocess_params["truncation"] = truncation + + if max_length is not None: + preprocess_params["max_length"] = max_length + generate_kwargs["max_length"] = max_length + + if prefix is not None: + preprocess_params["prefix"] = prefix + if prefix: + prefix_inputs = self.tokenizer( + prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework + ) + generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1] + + if handle_long_generation is not None: + if handle_long_generation != "hole": + raise ValueError( + f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected" + " [None, 'hole']" + ) + preprocess_params["handle_long_generation"] = handle_long_generation + + if continue_final_message is not None: + preprocess_params["continue_final_message"] = continue_final_message + + if tokenizer_encode_kwargs is not None: + preprocess_params["tokenizer_encode_kwargs"] = tokenizer_encode_kwargs + + preprocess_params.update(generate_kwargs) + + # forward kwargs + if stop_sequence is not None: + stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False) + generate_kwargs["eos_token_id"] = stop_sequence_ids + 
forward_params = generate_kwargs + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + # postprocess kwargs + postprocess_params = {} + if return_full_text is not None and return_type is None: + if return_text is not None: + raise ValueError("`return_text` is mutually exclusive with `return_full_text`") + if return_tensors is not None: + raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT + if return_tensors is not None and return_type is None: + if return_text is not None: + raise ValueError("`return_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.TENSORS + if return_type is not None: + postprocess_params["return_type"] = return_type + if clean_up_tokenization_spaces is not None: + postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + if continue_final_message is not None: + postprocess_params["continue_final_message"] = continue_final_message + if skip_special_tokens is not None: + postprocess_params["skip_special_tokens"] = skip_special_tokens + + return preprocess_params, forward_params, postprocess_params + + # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments + def _parse_and_tokenize(self, *args, **kwargs): + """ + Parse arguments and tokenize + """ + # Parse arguments + if self.model.__class__.__name__ == "TransfoXLLMHeadModel": + kwargs.update({"add_space_before_punct_symbol": True}) + + return super()._parse_and_tokenize(*args, **kwargs) + + @overload + def __call__(self, text_inputs: str, **kwargs: Any) -> list[dict[str, str]]: ... + + @overload + def __call__(self, text_inputs: list[str], **kwargs: Any) -> list[list[dict[str, str]]]: ... + + @overload + def __call__(self, text_inputs: ChatType, **kwargs: Any) -> list[dict[str, ChatType]]: ... + + @overload + def __call__(self, text_inputs: list[ChatType], **kwargs: Any) -> list[list[dict[str, ChatType]]]: ... + + def __call__(self, text_inputs, **kwargs): + """ + Complete the prompt(s) given as inputs. + + Args: + text_inputs (`str`, `list[str]`, list[dict[str, str]], or `list[list[dict[str, str]]]`): + One or several prompts (or one list of prompts) to complete. If strings or a list of string are + passed, this pipeline will continue each prompt. Alternatively, a "chat", in the form of a list + of dicts with "role" and "content" keys, can be passed, or a list of such chats. When chats are passed, + the model's chat template will be used to format them before passing them to the model. + return_tensors (`bool`, *optional*, defaults to `False`): + Returns the tensors of predictions (as token indices) in the outputs. If set to + `True`, the decoded text is not returned. + return_text (`bool`, *optional*): + Returns the decoded texts in the outputs. + return_full_text (`bool`, *optional*, defaults to `True`): + If set to `False` only added text is returned, otherwise the full text is returned. Cannot be + specified at the same time as `return_text`. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the potential extra spaces in the text output. 
+ continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the + last message in the input chat rather than starting a new one, allowing you to "prefill" its response. + By default this is `True` when the final message in the input chat has the `assistant` role and + `False` otherwise, but you can manually override that behaviour by setting this flag. + prefix (`str`, *optional*): + Prefix added to prompt. + handle_long_generation (`str`, *optional*): + By default, this pipelines does not handle long generation (ones that exceed in one form or the other + the model maximum length). There is no perfect way to address this (more info + :https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common + strategies to work around that problem depending on your use case. + + - `None` : default strategy where nothing in particular happens + - `"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might + truncate a lot of the prompt and not suitable when generation exceed the model capacity) + tokenizer_encode_kwargs (`dict`, *optional*): + Additional keyword arguments to pass along to the encoding step of the tokenizer. If the text input is + a chat, it is passed to `apply_chat_template`. Otherwise, it is passed to `__call__`. + generate_kwargs (`dict`, *optional*): + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of lists of `dict`: Returns one of the following dictionaries (cannot return a combination + of both `generated_text` and `generated_token_ids`): + + - **generated_text** (`str`, present when `return_text=True`) -- The generated text. + - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the generated text. + """ + if isinstance( + text_inputs, + (list, tuple, types.GeneratorType, KeyDataset) + if is_torch_available() + else (list, tuple, types.GeneratorType), + ): + if isinstance(text_inputs, types.GeneratorType): + text_inputs, _ = itertools.tee(text_inputs) + text_inputs, first_item = (x for x in text_inputs), next(_) + else: + first_item = text_inputs[0] + if isinstance(first_item, (list, tuple, dict)): + # We have one or more prompts in list-of-dicts format, so this is chat mode + if isinstance(first_item, dict): + return super().__call__(Chat(text_inputs), **kwargs) + else: + chats = (Chat(chat) for chat in text_inputs) # 🐈 🐈 🐈 + if isinstance(text_inputs, types.GeneratorType): + return super().__call__(chats, **kwargs) + else: + return super().__call__(list(chats), **kwargs) + return super().__call__(text_inputs, **kwargs) + + def preprocess( + self, + prompt_text, + prefix="", + handle_long_generation=None, + add_special_tokens=None, + truncation=None, + padding=None, + max_length=None, + continue_final_message=None, + tokenizer_encode_kwargs=None, + **generate_kwargs, + ): + # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults + tokenizer_kwargs = { + "add_special_tokens": add_special_tokens, + "truncation": truncation, + "padding": padding, + "max_length": max_length, # NOTE: `max_length` is also a `generate` arg. 
Use `tokenizer_encode_kwargs` to avoid a name clash + } + tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None} + tokenizer_kwargs.update(tokenizer_encode_kwargs or {}) + + if isinstance(prompt_text, Chat): + tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats + # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default + # because very few models support multiple separate, consecutive assistant messages + if continue_final_message is None: + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + inputs = self.tokenizer.apply_chat_template( + prompt_text.messages, + add_generation_prompt=not continue_final_message, + continue_final_message=continue_final_message, + return_dict=True, + return_tensors=self.framework, + **tokenizer_kwargs, + ) + else: + inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs) + + inputs["prompt_text"] = prompt_text + + if handle_long_generation == "hole": + cur_len = inputs["input_ids"].shape[-1] + if "max_new_tokens" in generate_kwargs: + new_tokens = generate_kwargs["max_new_tokens"] + else: + new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len + if new_tokens < 0: + raise ValueError("We cannot infer how many new tokens are expected") + if cur_len + new_tokens > self.tokenizer.model_max_length: + keep_length = self.tokenizer.model_max_length - new_tokens + if keep_length <= 0: + raise ValueError( + "We cannot use `hole` to handle this generation the number of desired tokens exceeds the" + " models max length" + ) + + inputs["input_ids"] = inputs["input_ids"][:, -keep_length:] + if "attention_mask" in inputs: + inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:] + + return inputs + + def _forward(self, model_inputs, **generate_kwargs): + input_ids = model_inputs["input_ids"] + attention_mask = model_inputs.get("attention_mask", None) + # Allow empty prompts + if input_ids.shape[1] == 0: + input_ids = None + attention_mask = None + in_b = 1 + else: + in_b = input_ids.shape[0] + prompt_text = model_inputs.pop("prompt_text") + + # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying + # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. 
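+        # The prefix tokens count towards `max_length`/`min_length`, so when the caller relies on those absolute
+        # limits (rather than `max_new_tokens`/`min_new_tokens`) they are stretched by the prefix length below,
+        # keeping the prefix from eating into the requested generation budget.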
+ prefix_length = generate_kwargs.pop("prefix_length", 0) + if prefix_length > 0: + has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].max_new_tokens is not None + ) + if not has_max_new_tokens: + generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length + generate_kwargs["max_length"] += prefix_length + has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].min_new_tokens is not None + ) + if not has_min_new_tokens and "min_length" in generate_kwargs: + generate_kwargs["min_length"] += prefix_length + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + output = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) + + if isinstance(output, ModelOutput): + generated_sequence = output.sequences + other_outputs = {k: v for k, v in output.items() if k not in {"sequences", "past_key_values"}} + out_b = generated_sequence.shape[0] + + if self.framework == "pt": + for key, value in other_outputs.items(): + if isinstance(value, torch.Tensor) and value.shape[0] == out_b: + other_outputs[key] = value.reshape(in_b, out_b // in_b, *value.shape[1:]) + if isinstance(value, tuple) and len(value[0]) == out_b: + value = torch.stack(value).swapaxes(0, 1) + other_outputs[key] = value + elif self.framework == "tf": + for key, value in other_outputs.items(): + if isinstance(value, tf.Tensor) and value.shape[0] == out_b: + other_outputs[key] = tf.reshape(value, (in_b, out_b // in_b, *value.shape[1:])) + if isinstance(value, tuple) and len(value[0]) == out_b: + value = tf.stack(value).swapaxes(0, 1) + other_outputs[key] = value + else: + generated_sequence = output + other_outputs = {} + + out_b = generated_sequence.shape[0] + if self.framework == "pt": + generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:]) + elif self.framework == "tf": + generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])) + + model_outputs = { + "generated_sequence": generated_sequence, + "input_ids": input_ids, + "prompt_text": prompt_text, + } + if other_outputs: + model_outputs.update({"additional_outputs": other_outputs}) + return model_outputs + + def postprocess( + self, + model_outputs, + return_type=ReturnType.FULL_TEXT, + clean_up_tokenization_spaces=True, + continue_final_message=None, + skip_special_tokens=None, + ): + generated_sequence = model_outputs["generated_sequence"][0] + input_ids = model_outputs["input_ids"] + prompt_text = model_outputs["prompt_text"] + generated_sequence = generated_sequence.numpy().tolist() + records = [] + other_outputs = model_outputs.get("additional_outputs", {}) + split_keys = {} + if other_outputs: + if self.framework == "pt": + for k, v in other_outputs.items(): + if isinstance(v, torch.Tensor) and v.shape[0] == len(generated_sequence): + split_keys[k] = v.numpy().tolist() + elif self.framework == "tf": + for k, v in other_outputs.items(): + if isinstance(v, tf.Tensor) and v.shape[0] == len(generated_sequence): + split_keys[k] = v.numpy().tolist() + + skip_special_tokens = skip_special_tokens if skip_special_tokens is not None else True + for idx, sequence in enumerate(generated_sequence): + 
if return_type == ReturnType.TENSORS: + record = {"generated_token_ids": sequence} + elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}: + # Decode text + text = self.tokenizer.decode( + sequence, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + + # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used + if input_ids is None: + prompt_length = 0 + else: + prompt_length = len( + self.tokenizer.decode( + input_ids[0], + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + ) + + all_text = text[prompt_length:] + if return_type == ReturnType.FULL_TEXT: + if isinstance(prompt_text, str): + all_text = prompt_text + all_text + elif isinstance(prompt_text, Chat): + if continue_final_message is None: + # If the user passes a chat ending in an assistant message, we treat it as a prefill by + # default because very few models support multiple separate, consecutive assistant messages + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + if continue_final_message: + # With assistant prefill, concat onto the end of the last message + all_text = list(prompt_text.messages)[:-1] + [ + { + "role": prompt_text.messages[-1]["role"], + "content": prompt_text.messages[-1]["content"] + all_text, + } + ] + else: + # When we're not starting from a prefill, the output is a new assistant message + all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}] + record = {"generated_text": all_text} + for key, values in split_keys.items(): + record[key] = values[idx] + records.append(record) + + return records diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.py b/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..17eaba1466b34811e894064259e2045d330d6b9d --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.py @@ -0,0 +1,277 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.from typing import List, Union +from typing import Any, Union, overload + +from ..generation import GenerationConfig +from ..utils import is_torch_available +from .base import Pipeline + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING + from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan + +DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan" + + +class TextToAudioPipeline(Pipeline): + """ + Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This + pipeline generates an audio file from an input text and optional other conditional inputs. 
+ + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> pipe = pipeline(model="suno/bark-small") + >>> output = pipe("Hey it's HuggingFace on the phone!") + + >>> audio = output["audio"] + >>> sampling_rate = output["sampling_rate"] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + + + You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or + [`TextToAudioPipeline.__call__.generate_kwargs`]. + + Example: + + ```python + >>> from transformers import pipeline + + >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt") + + >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length + >>> generate_kwargs = { + ... "do_sample": True, + ... "temperature": 0.7, + ... "max_new_tokens": 35, + ... } + + >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs) + ``` + + + + This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or + `"text-to-audio"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech). + """ + + # Introducing the processor at load time for new behaviour + _load_processor = True + + _pipeline_calls_generate = True + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, *args, vocoder=None, sampling_rate=None, no_processor=True, **kwargs): + super().__init__(*args, **kwargs) + + # Legacy behaviour just uses the tokenizer while new models use the processor as a whole at any given time + self.no_processor = no_processor + + if self.framework == "tf": + raise ValueError("The TextToAudioPipeline is only available in PyTorch.") + + self.vocoder = None + if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values(): + self.vocoder = ( + SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device) + if vocoder is None + else vocoder + ) + + self.sampling_rate = sampling_rate + if self.vocoder is not None: + self.sampling_rate = self.vocoder.config.sampling_rate + + if self.sampling_rate is None: + # get sampling_rate from config and generation config + + config = self.model.config + gen_config = self.model.__dict__.get("generation_config", None) + if gen_config is not None: + config.update(gen_config.to_dict()) + + for sampling_rate_name in ["sample_rate", "sampling_rate"]: + sampling_rate = getattr(config, sampling_rate_name, None) + if sampling_rate is not None: + self.sampling_rate = sampling_rate + elif getattr(config, "codec_config", None) is not None: + sampling_rate = getattr(config.codec_config, sampling_rate_name, None) + if sampling_rate is not None: + self.sampling_rate = sampling_rate + + # last fallback to get the sampling rate based on processor + if self.sampling_rate is None and not self.no_processor and hasattr(self.processor, "feature_extractor"): + self.sampling_rate = self.processor.feature_extractor.sampling_rate + 
+ def preprocess(self, text, **kwargs): + if isinstance(text, str): + text = [text] + + if self.model.config.model_type == "bark": + # bark Tokenizer is called with BarkProcessor which uses those kwargs + new_kwargs = { + "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256), + "add_special_tokens": False, + "return_attention_mask": True, + "return_token_type_ids": False, + "padding": "max_length", + } + + # priority is given to kwargs + new_kwargs.update(kwargs) + + kwargs = new_kwargs + + preprocessor = self.tokenizer if self.no_processor else self.processor + output = preprocessor(text, **kwargs, return_tensors="pt") + + return output + + def _forward(self, model_inputs, **kwargs): + # we expect some kwargs to be additional tensors which need to be on the right device + kwargs = self._ensure_tensor_on_device(kwargs, device=self.device) + forward_params = kwargs["forward_params"] + generate_kwargs = kwargs["generate_kwargs"] + + if self.model.can_generate(): + # we expect some kwargs to be additional tensors which need to be on the right device + generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device) + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + # generate_kwargs get priority over forward_params + forward_params.update(generate_kwargs) + + output = self.model.generate(**model_inputs, **forward_params) + else: + if len(generate_kwargs): + raise ValueError( + "You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non " + "empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. " + f"For reference, the `generate_kwargs` used here are: {generate_kwargs.keys()}" + ) + output = self.model(**model_inputs, **forward_params)[0] + + if self.vocoder is not None: + # in that case, the output is a spectrogram that needs to be converted into a waveform + output = self.vocoder(output) + + return output + + @overload + def __call__(self, text_inputs: str, **forward_params: Any) -> dict[str, Any]: ... + + @overload + def __call__(self, text_inputs: list[str], **forward_params: Any) -> list[dict[str, Any]]: ... + + def __call__( + self, text_inputs: Union[str, list[str]], **forward_params + ) -> Union[dict[str, Any], list[dict[str, Any]]]: + """ + Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information. + + Args: + text_inputs (`str` or `list[str]`): + The text(s) to generate. + forward_params (`dict`, *optional*): + Parameters passed to the model generation/forward method. `forward_params` are always passed to the + underlying model. + generate_kwargs (`dict`, *optional*): + The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a + complete overview of generate, check the [following + guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are + only passed to the underlying model if the latter is a generative model. + + Return: + A `dict` or a list of `dict`: The dictionaries have two keys: + + - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform. + - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform. 
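+
+ Example (an illustrative sketch rather than a doctest; it assumes `scipy` is installed and reuses the
+ `output` dictionary from the class-level example above, squeezing a mono waveform to one dimension):
+
+ ```python
+ import numpy as np
+ import scipy.io.wavfile
+
+ scipy.io.wavfile.write("speech.wav", rate=output["sampling_rate"], data=np.squeeze(output["audio"]))
+ ```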
+ """ + return super().__call__(text_inputs, **forward_params) + + def _sanitize_parameters( + self, + preprocess_params=None, + forward_params=None, + generate_kwargs=None, + ): + if getattr(self, "assistant_model", None) is not None: + generate_kwargs["assistant_model"] = self.assistant_model + if getattr(self, "assistant_tokenizer", None) is not None: + generate_kwargs["tokenizer"] = self.tokenizer + generate_kwargs["assistant_tokenizer"] = self.assistant_tokenizer + + params = { + "forward_params": forward_params if forward_params else {}, + "generate_kwargs": generate_kwargs if generate_kwargs else {}, + } + + if preprocess_params is None: + preprocess_params = {} + postprocess_params = {} + + return preprocess_params, params, postprocess_params + + def postprocess(self, audio): + output_dict = {} + + if self.model.config.model_type == "csm": + waveform_key = "audio" + else: + waveform_key = "waveform" + + # We directly get the waveform + if self.no_processor: + if isinstance(audio, dict): + waveform = audio[waveform_key] + elif isinstance(audio, tuple): + waveform = audio[0] + else: + waveform = audio + # Or we need to postprocess to get the waveform + else: + waveform = self.processor.decode(audio) + + if isinstance(audio, list): + output_dict["audio"] = [el.to(device="cpu", dtype=torch.float).numpy() for el in waveform] + else: + output_dict["audio"] = waveform.to(device="cpu", dtype=torch.float).numpy() + output_dict["sampling_rate"] = self.sampling_rate + + return output_dict diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/token_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..31ba1c481107885089709f9e4a52054d26c7aca8 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/token_classification.py @@ -0,0 +1,668 @@ +import types +import warnings +from typing import Any, Optional, Union, overload + +import numpy as np + +from ..models.bert.tokenization_bert import BasicTokenizer +from ..utils import ( + ExplicitEnum, + add_end_docstrings, + is_tf_available, + is_torch_available, +) +from .base import ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args + + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES + + +class TokenClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for token classification. 
+ """ + + def __call__(self, inputs: Union[str, list[str]], **kwargs): + is_split_into_words = kwargs.get("is_split_into_words", False) + delimiter = kwargs.get("delimiter") + + if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: + inputs = list(inputs) + batch_size = len(inputs) + elif isinstance(inputs, str): + inputs = [inputs] + batch_size = 1 + elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType): + return inputs, is_split_into_words, None, delimiter + else: + raise ValueError("At least one input is required.") + + offset_mapping = kwargs.get("offset_mapping") + if offset_mapping: + if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple): + offset_mapping = [offset_mapping] + if len(offset_mapping) != batch_size: + raise ValueError("offset_mapping should have the same batch size as the input") + return inputs, is_split_into_words, offset_mapping, delimiter + + +class AggregationStrategy(ExplicitEnum): + """All the valid aggregation strategies for TokenClassificationPipeline""" + + NONE = "none" + SIMPLE = "simple" + FIRST = "first" + AVERAGE = "average" + MAX = "max" + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True), + r""" + ignore_labels (`list[str]`, defaults to `["O"]`): + A list of labels to ignore. + grouped_entities (`bool`, *optional*, defaults to `False`): + DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the + same entity together in the predictions or not. + stride (`int`, *optional*): + If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size + model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The + value of this argument defines the number of overlapping tokens between chunks. In other words, the model + will shift forward by `tokenizer.model_max_length - stride` tokens each step. + aggregation_strategy (`str`, *optional*, defaults to `"none"`): + The strategy to fuse (or not) tokens based on the model prediction. + + - "none" : Will simply not do any aggregation and simply return raw results from the model + - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C, + I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D", + "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as + different entities. On word based languages, we might end up splitting words undesirably : Imagine + Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity": + "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages + that support that meaning, which is basically tokens separated by a space). These mitigations will + only work on real words, "New york" might still be tagged with two different entities. + - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot + end up with different tags. Words will simply use the tag of the first token of the word when there + is ambiguity. + - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words, + cannot end up with different tags. scores will be averaged first across tokens, and then the maximum + label is applied. 
+ - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot + end up with different tags. Word entity will simply be the token with the maximum score.""", +) +class TokenClassificationPipeline(ChunkPipeline): + """ + Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition + examples](../task_summary#named-entity-recognition) for more information. + + Example: + + ```python + >>> from transformers import pipeline + + >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple") + >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal" + >>> tokens = token_classifier(sentence) + >>> tokens + [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}] + + >>> token = tokens[0] + >>> # Start and end provide an easy way to highlight words in the original text. + >>> sentence[token["start"] : token["end"]] + ' jean-baptiste' + + >>> # Some models use the same idea to do part of speech. + >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple") + >>> syntaxer("My name is Sarah and I live in London") + [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous). + + The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=token-classification). 
+ """ + + default_input_names = "sequences" + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, args_parser=TokenClassificationArgumentHandler(), **kwargs): + super().__init__(**kwargs) + + self.check_model_type( + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES + ) + + self._basic_tokenizer = BasicTokenizer(do_lower_case=False) + self._args_parser = args_parser + + def _sanitize_parameters( + self, + ignore_labels=None, + grouped_entities: Optional[bool] = None, + ignore_subwords: Optional[bool] = None, + aggregation_strategy: Optional[AggregationStrategy] = None, + offset_mapping: Optional[list[tuple[int, int]]] = None, + is_split_into_words: bool = False, + stride: Optional[int] = None, + delimiter: Optional[str] = None, + ): + preprocess_params = {} + preprocess_params["is_split_into_words"] = is_split_into_words + + if is_split_into_words: + preprocess_params["delimiter"] = " " if delimiter is None else delimiter + + if offset_mapping is not None: + preprocess_params["offset_mapping"] = offset_mapping + + postprocess_params = {} + if grouped_entities is not None or ignore_subwords is not None: + if grouped_entities and ignore_subwords: + aggregation_strategy = AggregationStrategy.FIRST + elif grouped_entities and not ignore_subwords: + aggregation_strategy = AggregationStrategy.SIMPLE + else: + aggregation_strategy = AggregationStrategy.NONE + + if grouped_entities is not None: + warnings.warn( + "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to" + f' `aggregation_strategy="{aggregation_strategy}"` instead.' + ) + if ignore_subwords is not None: + warnings.warn( + "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to" + f' `aggregation_strategy="{aggregation_strategy}"` instead.' + ) + + if aggregation_strategy is not None: + if isinstance(aggregation_strategy, str): + aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()] + if ( + aggregation_strategy + in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE} + and not self.tokenizer.is_fast + ): + raise ValueError( + "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option" + ' to `"simple"` or use a fast tokenizer.' + ) + postprocess_params["aggregation_strategy"] = aggregation_strategy + if ignore_labels is not None: + postprocess_params["ignore_labels"] = ignore_labels + if stride is not None: + if stride >= self.tokenizer.model_max_length: + raise ValueError( + "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)" + ) + if aggregation_strategy == AggregationStrategy.NONE: + raise ValueError( + "`stride` was provided to process all the text but `aggregation_strategy=" + f'"{aggregation_strategy}"`, please select another one instead.' + ) + else: + if self.tokenizer.is_fast: + tokenizer_params = { + "return_overflowing_tokens": True, + "padding": True, + "stride": stride, + } + preprocess_params["tokenizer_params"] = tokenizer_params + else: + raise ValueError( + "`stride` was provided to process all the text but you're using a slow tokenizer." + " Please use a fast tokenizer." + ) + return preprocess_params, {}, postprocess_params + + @overload + def __call__(self, inputs: str, **kwargs: Any) -> list[dict[str, str]]: ... 
+ + @overload + def __call__(self, inputs: list[str], **kwargs: Any) -> list[list[dict[str, str]]]: ... + + def __call__( + self, inputs: Union[str, list[str]], **kwargs: Any + ) -> Union[list[dict[str, str]], list[list[dict[str, str]]]]: + """ + Classify each token of the text(s) given as inputs. + + Args: + inputs (`str` or `List[str]`): + One or several texts (or one list of texts) for token classification. Can be pre-tokenized when + `is_split_into_words=True`. + + Return: + A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the + corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with + the following keys: + + - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you + want to have the exact string in the original sentence, use `start` and `end`. + - **score** (`float`) -- The corresponding probability for `entity`. + - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when + *aggregation_strategy* is not `"none"`. + - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding + token in the sentence. + - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only + exists if the offsets are available within the tokenizer + - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only + exists if the offsets are available within the tokenizer + """ + + _inputs, is_split_into_words, offset_mapping, delimiter = self._args_parser(inputs, **kwargs) + kwargs["is_split_into_words"] = is_split_into_words + kwargs["delimiter"] = delimiter + if is_split_into_words and not all(isinstance(input, list) for input in inputs): + return super().__call__([inputs], **kwargs) + if offset_mapping: + kwargs["offset_mapping"] = offset_mapping + + return super().__call__(inputs, **kwargs) + + def preprocess(self, sentence, offset_mapping=None, **preprocess_params): + tokenizer_params = preprocess_params.pop("tokenizer_params", {}) + truncation = self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 + + word_to_chars_map = None + is_split_into_words = preprocess_params["is_split_into_words"] + if is_split_into_words: + delimiter = preprocess_params["delimiter"] + if not isinstance(sentence, list): + raise ValueError("When `is_split_into_words=True`, `sentence` must be a list of tokens.") + words = sentence + sentence = delimiter.join(words) # Recreate the sentence string for later display and slicing + # This map will allows to convert back word => char indices + word_to_chars_map = [] + delimiter_len = len(delimiter) + char_offset = 0 + for word in words: + word_to_chars_map.append((char_offset, char_offset + len(word))) + char_offset += len(word) + delimiter_len + + # We use `words` as the actual input for the tokenizer + text_to_tokenize = words + tokenizer_params["is_split_into_words"] = True + else: + if not isinstance(sentence, str): + raise ValueError("When `is_split_into_words=False`, `sentence` must be an untokenized string.") + text_to_tokenize = sentence + + inputs = self.tokenizer( + text_to_tokenize, + return_tensors=self.framework, + truncation=truncation, + return_special_tokens_mask=True, + return_offsets_mapping=self.tokenizer.is_fast, + **tokenizer_params, + ) + + if is_split_into_words and not self.tokenizer.is_fast: + raise 
ValueError("is_split_into_words=True is only supported with fast tokenizers.") + + inputs.pop("overflow_to_sample_mapping", None) + num_chunks = len(inputs["input_ids"]) + + for i in range(num_chunks): + if self.framework == "tf": + model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()} + else: + model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()} + if offset_mapping is not None: + model_inputs["offset_mapping"] = offset_mapping + + model_inputs["sentence"] = sentence if i == 0 else None + model_inputs["is_last"] = i == num_chunks - 1 + if word_to_chars_map is not None: + model_inputs["word_ids"] = inputs.word_ids(i) + model_inputs["word_to_chars_map"] = word_to_chars_map + + yield model_inputs + + def _forward(self, model_inputs): + # Forward + special_tokens_mask = model_inputs.pop("special_tokens_mask") + offset_mapping = model_inputs.pop("offset_mapping", None) + sentence = model_inputs.pop("sentence") + is_last = model_inputs.pop("is_last") + word_ids = model_inputs.pop("word_ids", None) + word_to_chars_map = model_inputs.pop("word_to_chars_map", None) + + if self.framework == "tf": + logits = self.model(**model_inputs)[0] + else: + output = self.model(**model_inputs) + logits = output["logits"] if isinstance(output, dict) else output[0] + + return { + "logits": logits, + "special_tokens_mask": special_tokens_mask, + "offset_mapping": offset_mapping, + "sentence": sentence, + "is_last": is_last, + "word_ids": word_ids, + "word_to_chars_map": word_to_chars_map, + **model_inputs, + } + + def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None): + if ignore_labels is None: + ignore_labels = ["O"] + all_entities = [] + + # Get map from the first output, it's the same for all chunks + word_to_chars_map = all_outputs[0].get("word_to_chars_map") + + for model_outputs in all_outputs: + if self.framework == "pt" and model_outputs["logits"][0].dtype in (torch.bfloat16, torch.float16): + logits = model_outputs["logits"][0].to(torch.float32).numpy() + else: + logits = model_outputs["logits"][0].numpy() + + sentence = all_outputs[0]["sentence"] + input_ids = model_outputs["input_ids"][0] + offset_mapping = ( + model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None + ) + special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy() + word_ids = model_outputs.get("word_ids") + + maxes = np.max(logits, axis=-1, keepdims=True) + shifted_exp = np.exp(logits - maxes) + scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + if self.framework == "tf": + input_ids = input_ids.numpy() + offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None + + pre_entities = self.gather_pre_entities( + sentence, + input_ids, + scores, + offset_mapping, + special_tokens_mask, + aggregation_strategy, + word_ids=word_ids, + word_to_chars_map=word_to_chars_map, + ) + grouped_entities = self.aggregate(pre_entities, aggregation_strategy) + # Filter anything that is in self.ignore_labels + entities = [ + entity + for entity in grouped_entities + if entity.get("entity", None) not in ignore_labels + and entity.get("entity_group", None) not in ignore_labels + ] + all_entities.extend(entities) + num_chunks = len(all_outputs) + if num_chunks > 1: + all_entities = self.aggregate_overlapping_entities(all_entities) + return all_entities + + def aggregate_overlapping_entities(self, entities): + if len(entities) == 0: + return entities + entities = sorted(entities, key=lambda x: 
x["start"]) + aggregated_entities = [] + previous_entity = entities[0] + for entity in entities: + if previous_entity["start"] <= entity["start"] < previous_entity["end"]: + current_length = entity["end"] - entity["start"] + previous_length = previous_entity["end"] - previous_entity["start"] + if ( + current_length > previous_length + or current_length == previous_length + and entity["score"] > previous_entity["score"] + ): + previous_entity = entity + else: + aggregated_entities.append(previous_entity) + previous_entity = entity + aggregated_entities.append(previous_entity) + return aggregated_entities + + def gather_pre_entities( + self, + sentence: str, + input_ids: np.ndarray, + scores: np.ndarray, + offset_mapping: Optional[list[tuple[int, int]]], + special_tokens_mask: np.ndarray, + aggregation_strategy: AggregationStrategy, + word_ids: Optional[list[Optional[int]]] = None, + word_to_chars_map: Optional[list[tuple[int, int]]] = None, + ) -> list[dict]: + """Fuse various numpy arrays into dicts with all the information needed for aggregation""" + pre_entities = [] + for idx, token_scores in enumerate(scores): + # Filter special_tokens + if special_tokens_mask[idx]: + continue + + word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) + if offset_mapping is not None: + start_ind, end_ind = offset_mapping[idx] + + # If the input is pre-tokenized, we need to rescale the offsets to the absolute sentence. + if word_ids is not None and word_to_chars_map is not None: + word_index = word_ids[idx] + if word_index is not None: + start_char, _ = word_to_chars_map[word_index] + start_ind += start_char + end_ind += start_char + + if not isinstance(start_ind, int): + if self.framework == "pt": + start_ind = start_ind.item() + end_ind = end_ind.item() + word_ref = sentence[start_ind:end_ind] + if getattr(self.tokenizer, "_tokenizer", None) and getattr( + self.tokenizer._tokenizer.model, "continuing_subword_prefix", None + ): + # This is a BPE, word aware tokenizer, there is a correct way + # to fuse tokens + is_subword = len(word) != len(word_ref) + else: + # This is a fallback heuristic. This will fail most likely on any kind of text + punctuation mixtures that will be considered "words". Non word aware models cannot do better than this unfortunately. 
+ if aggregation_strategy in { + AggregationStrategy.FIRST, + AggregationStrategy.AVERAGE, + AggregationStrategy.MAX, + }: + warnings.warn( + "Tokenizer does not support real words, using fallback heuristic", + UserWarning, + ) + is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1] + + if int(input_ids[idx]) == self.tokenizer.unk_token_id: + word = word_ref + is_subword = False + else: + start_ind = None + end_ind = None + is_subword = False + + pre_entity = { + "word": word, + "scores": token_scores, + "start": start_ind, + "end": end_ind, + "index": idx, + "is_subword": is_subword, + } + pre_entities.append(pre_entity) + return pre_entities + + def aggregate(self, pre_entities: list[dict], aggregation_strategy: AggregationStrategy) -> list[dict]: + if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}: + entities = [] + for pre_entity in pre_entities: + entity_idx = pre_entity["scores"].argmax() + score = pre_entity["scores"][entity_idx] + entity = { + "entity": self.model.config.id2label[entity_idx], + "score": score, + "index": pre_entity["index"], + "word": pre_entity["word"], + "start": pre_entity["start"], + "end": pre_entity["end"], + } + entities.append(entity) + else: + entities = self.aggregate_words(pre_entities, aggregation_strategy) + + if aggregation_strategy == AggregationStrategy.NONE: + return entities + return self.group_entities(entities) + + def aggregate_word(self, entities: list[dict], aggregation_strategy: AggregationStrategy) -> dict: + word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities]) + if aggregation_strategy == AggregationStrategy.FIRST: + scores = entities[0]["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.model.config.id2label[idx] + elif aggregation_strategy == AggregationStrategy.MAX: + max_entity = max(entities, key=lambda entity: entity["scores"].max()) + scores = max_entity["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.model.config.id2label[idx] + elif aggregation_strategy == AggregationStrategy.AVERAGE: + scores = np.stack([entity["scores"] for entity in entities]) + average_scores = np.nanmean(scores, axis=0) + entity_idx = average_scores.argmax() + entity = self.model.config.id2label[entity_idx] + score = average_scores[entity_idx] + else: + raise ValueError("Invalid aggregation_strategy") + new_entity = { + "entity": entity, + "score": score, + "word": word, + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return new_entity + + def aggregate_words(self, entities: list[dict], aggregation_strategy: AggregationStrategy) -> list[dict]: + """ + Override tokens from a given word that disagree to force agreement on word boundaries. 
+ + Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft| + company| B-ENT I-ENT + """ + if aggregation_strategy in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }: + raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation") + + word_entities = [] + word_group = None + for entity in entities: + if word_group is None: + word_group = [entity] + elif entity["is_subword"]: + word_group.append(entity) + else: + word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) + word_group = [entity] + # Last item + if word_group is not None: + word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) + return word_entities + + def group_sub_entities(self, entities: list[dict]) -> dict: + """ + Group together the adjacent tokens with the same entity predicted. + + Args: + entities (`dict`): The entities predicted by the pipeline. + """ + # Get the first entity in the entity group + entity = entities[0]["entity"].split("-", 1)[-1] + scores = np.nanmean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": np.mean(scores), + "word": self.tokenizer.convert_tokens_to_string(tokens), + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return entity_group + + def get_tag(self, entity_name: str) -> tuple[str, str]: + if entity_name.startswith("B-"): + bi = "B" + tag = entity_name[2:] + elif entity_name.startswith("I-"): + bi = "I" + tag = entity_name[2:] + else: + # It's not in B-, I- format + # Default to I- for continuation. + bi = "I" + tag = entity_name + return bi, tag + + def group_entities(self, entities: list[dict]) -> list[dict]: + """ + Find and group together the adjacent tokens with the same entity predicted. + + Args: + entities (`dict`): The entities predicted by the pipeline. + """ + + entity_groups = [] + entity_group_disagg = [] + + for entity in entities: + if not entity_group_disagg: + entity_group_disagg.append(entity) + continue + + # If the current entity is similar and adjacent to the previous entity, + # append it to the disaggregated entity group + # The split is meant to account for the "B" and "I" prefixes + # Shouldn't merge if both entities are B-type + bi, tag = self.get_tag(entity["entity"]) + last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) + + if tag == last_tag and bi != "B": + # Modify subword type to be previous_type + entity_group_disagg.append(entity) + else: + # If the current entity is different from the previous entity + # aggregate the disaggregated entity group + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + entity_group_disagg = [entity] + if entity_group_disagg: + # it's the last entity, add it to the entity groups + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + + return entity_groups + + +NerPipeline = TokenClassificationPipeline diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/video_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/video_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..1ee8dc86e161e3b24f1e71701239f6cc3b2d02a5 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/video_classification.py @@ -0,0 +1,195 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +from io import BytesIO +from typing import Any, Optional, Union, overload + +import requests + +from ..utils import ( + add_end_docstrings, + is_av_available, + is_torch_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_av_available(): + import av + import numpy as np + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class VideoClassificationPipeline(Pipeline): + """ + Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a + video. + + This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"video-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=video-classification). + """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "av") + self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES) + + def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None): + preprocess_params = {} + if frame_sampling_rate is not None: + preprocess_params["frame_sampling_rate"] = frame_sampling_rate + if num_frames is not None: + preprocess_params["num_frames"] = num_frames + + postprocess_params = {} + if top_k is not None: + postprocess_params["top_k"] = top_k + if function_to_apply is not None: + if function_to_apply not in ["softmax", "sigmoid", "none"]: + raise ValueError( + f"Invalid value for `function_to_apply`: {function_to_apply}. " + "Valid options are ['softmax', 'sigmoid', 'none']" + ) + postprocess_params["function_to_apply"] = function_to_apply + else: + postprocess_params["function_to_apply"] = "softmax" + return preprocess_params, {}, postprocess_params + + @overload + def __call__(self, inputs: str, **kwargs: Any) -> list[dict[str, Any]]: ... + + @overload + def __call__(self, inputs: list[str], **kwargs: Any) -> list[list[dict[str, Any]]]: ... + + def __call__(self, inputs: Optional[Union[str, list[str]]] = None, **kwargs): + """ + Assign labels to the video(s) passed as inputs. + + Args: + inputs (`str`, `list[str]`): + The pipeline handles three types of videos: + + - A string containing a http link pointing to a video + - A string containing a local path to a video + + The pipeline accepts either a single video or a batch of videos, which must then be passed as a string. + Videos in a batch must all be in the same format: all as http links or all as local paths. + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. 
If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`): + The number of frames sampled from the video to run the classification on. If not provided, will default + to the number of frames specified in the model configuration. + frame_sampling_rate (`int`, *optional*, defaults to 1): + The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every + frame will be used. + function_to_apply(`str`, *optional*, defaults to "softmax"): + The function to apply to the model output. By default, the pipeline will apply the softmax function to + the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's + built-in `None` will default to "softmax", so you need to pass the string "none" to disable any + post-processing. + + Return: + A list of dictionaries or a list of list of dictionaries containing result. If the input is a single video, + will return a list of `top_k` dictionaries, if the input is a list of several videos, will return a list of list of + `top_k` dictionaries corresponding to the videos. + + The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. + - **score** (`int`) -- The score attributed by the model for that label. + """ + # After deprecation of this is completed, remove the default `None` value for `images` + if "videos" in kwargs: + warnings.warn( + "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted", + FutureWarning, + ) + inputs = kwargs.pop("videos") + if inputs is None: + raise ValueError("Cannot call the video-classification pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def preprocess(self, video, num_frames=None, frame_sampling_rate=1): + if num_frames is None: + num_frames = self.model.config.num_frames + + if video.startswith("http://") or video.startswith("https://"): + video = BytesIO(requests.get(video).content) + + container = av.open(video) + + start_idx = 0 + end_idx = num_frames * frame_sampling_rate - 1 + indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64) + + video = read_video_pyav(container, indices) + video = list(video) + + model_inputs = self.image_processor(video, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.dtype) + return model_inputs + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"): + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + + if self.framework == "pt": + if function_to_apply == "softmax": + probs = model_outputs.logits[0].softmax(-1) + elif function_to_apply == "sigmoid": + probs = model_outputs.logits[0].sigmoid() + else: + probs = model_outputs.logits[0] + scores, ids = probs.topk(top_k) + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + scores = scores.tolist() + ids = ids.tolist() + return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] + + +def read_video_pyav(container, indices): + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > 
end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/visual_question_answering.py b/venv/lib/python3.13/site-packages/transformers/pipelines/visual_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..609eaf2e9d554c3215c4867060779376fccc218a --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/visual_question_answering.py @@ -0,0 +1,216 @@ +from typing import Optional, Union + +from ..generation import GenerationConfig +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES + from .pt_utils import KeyDataset + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True)) +class VisualQuestionAnsweringPipeline(Pipeline): + """ + Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only + available in PyTorch. + + Unless the model you're using explicitly sets these generation parameters in its configuration files + (`generation_config.json`), the following default values will be used: + - max_new_tokens: 256 + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa") + >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png" + >>> oracle(question="What is she wearing ?", image=image_url) + [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}] + + >>> oracle(question="What is she wearing ?", image=image_url, top_k=1) + [{'score': 0.948, 'answer': 'hat'}] + + >>> oracle(question="Is this a person ?", image=image_url, top_k=1) + [{'score': 0.993, 'answer': 'yes'}] + + >>> oracle(question="Is this a man ?", image=image_url, top_k=1) + [{'score': 0.996, 'answer': 'no'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifiers: `"visual-question-answering", "vqa"`. + + The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See + the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering). 
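+
+ A single image can also be broadcast over several questions (an illustrative sketch reusing `oracle` and
+ `image_url` from the example above; each question then yields its own list of answers):
+
+ ```python
+ answers = oracle(question=["What is she wearing ?", "Is this a person ?"], image=image_url, top_k=1)
+ ```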
+ """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + + _pipeline_calls_generate = True + # Make sure the docstring is updated when the default generation config is changed + _default_generation_config = GenerationConfig( + max_new_tokens=256, + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES) + + def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs): + preprocess_params, postprocess_params = {}, {} + if padding is not None: + preprocess_params["padding"] = padding + if truncation is not None: + preprocess_params["truncation"] = truncation + if timeout is not None: + preprocess_params["timeout"] = timeout + if top_k is not None: + postprocess_params["top_k"] = top_k + + forward_params = {} + if getattr(self, "assistant_model", None) is not None: + forward_params["assistant_model"] = self.assistant_model + if getattr(self, "assistant_tokenizer", None) is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, postprocess_params + + def __call__( + self, + image: Union["Image.Image", str, list["Image.Image"], list[str], "KeyDataset"], + question: Optional[Union[str, list[str]]] = None, + **kwargs, + ): + r""" + Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed + below: + + - `pipeline(image=image, question=question)` + - `pipeline({"image": image, "question": question})` + - `pipeline([{"image": image, "question": question}])` + - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])` + + Args: + image (`str`, `list[str]`, `PIL.Image`, `list[PIL.Image]` or `KeyDataset`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. If given a single image, it can be + broadcasted to multiple questions. + For dataset: the passed in dataset must be of type `transformers.pipelines.pt_utils.KeyDataset` + Example: + ```python + >>> from transformers.pipelines.pt_utils import KeyDataset + >>> from datasets import load_dataset + + >>> dataset = load_dataset("detection-datasets/coco") + >>> oracle(image=KeyDataset(dataset, "image"), question="What's in this image?") + + ``` + question (`str`, `list[str]`): + The question(s) asked. If given a single question, it can be broadcasted to multiple images. + If multiple images and questions are given, each and every question will be broadcasted to all images + (same effect as a Cartesian product) + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: + A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. 
+ - **score** (`int`) -- The score attributed by the model for that label. + """ + is_dataset = isinstance(image, KeyDataset) + is_image_batch = isinstance(image, list) and all(isinstance(item, (Image.Image, str)) for item in image) + is_question_batch = isinstance(question, list) and all(isinstance(item, str) for item in question) + + if isinstance(image, (Image.Image, str)) and isinstance(question, str): + inputs = {"image": image, "question": question} + elif (is_image_batch or is_dataset) and isinstance(question, str): + inputs = [{"image": im, "question": question} for im in image] + elif isinstance(image, (Image.Image, str)) and is_question_batch: + inputs = [{"image": image, "question": q} for q in question] + elif (is_image_batch or is_dataset) and is_question_batch: + question_image_pairs = [] + for q in question: + for im in image: + question_image_pairs.append({"image": im, "question": q}) + inputs = question_image_pairs + else: + """ + Supports the following format + - {"image": image, "question": question} + - [{"image": image, "question": question}] + - Generator and datasets + """ + inputs = image + results = super().__call__(inputs, **kwargs) + return results + + def preprocess(self, inputs, padding=False, truncation=False, timeout=None): + image = load_image(inputs["image"], timeout=timeout) + model_inputs = self.tokenizer( + inputs["question"], + return_tensors=self.framework, + padding=padding, + truncation=truncation, + ) + image_features = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.dtype) + model_inputs.update(image_features) + return model_inputs + + def _forward(self, model_inputs, **generate_kwargs): + if self.model.can_generate(): + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + model_outputs = self.model.generate(**model_inputs, **generate_kwargs) + else: + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5): + if self.model.can_generate(): + return [ + {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()} + for output_ids in model_outputs + ] + else: + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + + if self.framework == "pt": + probs = model_outputs.logits.sigmoid()[0] + scores, ids = probs.topk(top_k) + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + scores = scores.tolist() + ids = ids.tolist() + return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..9c21681a0d8ef1c93eb16bbb1740a509f091a0bb --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import UserDict +from typing import Any, Union + +import numpy as np +import requests + +from ..utils import ( + add_end_docstrings, + logging, +) +from .audio_classification import ffmpeg_read +from .base import Pipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True)) +class ZeroShotAudioClassificationPipeline(Pipeline): + """ + Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you + provide an audio and a set of `candidate_labels`. + + + + The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage. + + + + Example: + ```python + >>> from transformers import pipeline + >>> from datasets import load_dataset + + >>> dataset = load_dataset("ashraq/esc50") + >>> audio = next(iter(dataset["train"]["audio"]))["array"] + >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused") + >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"]) + [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}] + ``` + + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio + classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-audio-classification"`. See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification). + """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = True + _load_tokenizer = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if self.framework != "pt": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + # No specific FOR_XXX available yet + + def __call__(self, audios: Union[np.ndarray, bytes, str, dict], **kwargs: Any) -> list[dict[str, Any]]: + """ + Assign labels to the audio(s) passed as inputs. + + Args: + audios (`str`, `list[str]`, `np.array` or `list[np.array]`): + The pipeline handles three types of inputs: + - A string containing a http link pointing to an audio + - A string containing a local path to an audio + - An audio loaded in numpy + candidate_labels (`list[str]`): + The candidate labels for this audio. They will be formatted using *hypothesis_template*. + hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`): + The format used in conjunction with *candidate_labels* to attempt the audio classification by + replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are + already formatted. + Return: + A list of dictionaries containing one entry per proposed label. Each dictionary contains the + following keys: + - **label** (`str`) -- One of the suggested *candidate_labels*. + - **score** (`float`) -- The score attributed by the model to that label. 
It is a value between + 0 and 1, computed as the `softmax` of `logits_per_audio`. + """ + return super().__call__(audios, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + return preprocess_params, {}, {} + + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."): + if isinstance(audio, str): + if audio.startswith("http://") or audio.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + audio = requests.get(audio).content + else: + with open(audio, "rb") as f: + audio = f.read() + + if isinstance(audio, bytes): + audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate) + + if not isinstance(audio, np.ndarray): + raise TypeError("We expect a numpy ndarray as input") + if len(audio.shape) != 1: + raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline") + + inputs = self.feature_extractor( + [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" + ) + if self.framework == "pt": + inputs = inputs.to(self.dtype) + inputs["candidate_labels"] = candidate_labels + sequences = [hypothesis_template.format(x) for x in candidate_labels] + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True) + inputs["text_inputs"] = [text_inputs] + return inputs + + def _forward(self, model_inputs): + candidate_labels = model_inputs.pop("candidate_labels") + text_inputs = model_inputs.pop("text_inputs") + if isinstance(text_inputs[0], UserDict): + text_inputs = text_inputs[0] + else: + # Batching case. + text_inputs = text_inputs[0][0] + + outputs = self.model(**text_inputs, **model_inputs) + + model_outputs = { + "candidate_labels": candidate_labels, + "logits": outputs.logits_per_audio, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = model_outputs.pop("candidate_labels") + logits = model_outputs["logits"][0] + + if self.framework == "pt": + probs = logits.softmax(dim=0) + scores = probs.tolist() + else: + raise ValueError("`tf` framework not supported.") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a609bcd1677bad2b4a09fb1955073714201125 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_classification.py @@ -0,0 +1,271 @@ +import inspect +from typing import Union + +import numpy as np + +from ..tokenization_utils import TruncationStrategy +from ..utils import add_end_docstrings, logging +from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + + +class ZeroShotClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for zero-shot for text classification by turning each possible label into an NLI + premise/hypothesis pair. 
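+
+ For illustration, one sequence and two candidate labels expand into one premise/hypothesis pair per label
+ (a minimal sketch of calling the handler directly):
+
+ ```python
+ handler = ZeroShotClassificationArgumentHandler()
+ sequence_pairs, sequences = handler("I need help", ["urgent", "not urgent"], "This example is {}.")
+ # sequence_pairs == [["I need help", "This example is urgent."],
+ #                    ["I need help", "This example is not urgent."]]
+ # sequences == ["I need help"]
+ ```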
+ """ + + def _parse_labels(self, labels): + if isinstance(labels, str): + labels = [label.strip() for label in labels.split(",") if label.strip()] + return labels + + def __call__(self, sequences, labels, hypothesis_template): + if len(labels) == 0 or len(sequences) == 0: + raise ValueError("You must include at least one label and at least one sequence.") + if hypothesis_template.format(labels[0]) == hypothesis_template: + raise ValueError( + f'The provided hypothesis_template "{hypothesis_template}" was not able to be formatted with the target labels. ' + "Make sure the passed template includes formatting syntax such as {} where the label should go." + ) + + if isinstance(sequences, str): + sequences = [sequences] + + sequence_pairs = [] + for sequence in sequences: + sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels]) + + return sequence_pairs, sequences + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class ZeroShotClassificationPipeline(ChunkPipeline): + """ + NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural + language inference) tasks. Equivalent of `text-classification` pipelines, but these models don't require a + hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is + **much** more flexible. + + Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis + pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate + label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model + config's :attr:*~transformers.PretrainedConfig.label2id*. + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="facebook/bart-large-mnli") + >>> oracle( + ... "I have a problem with my iphone that needs to be resolved asap!!", + ... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], + ... ) + {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} + + >>> oracle( + ... "I have a problem with my iphone that needs to be resolved asap!!", + ... candidate_labels=["english", "german"], + ... ) + {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['english', 'german'], 'scores': [0.814, 0.186]} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-classification"`. + + The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list + of available models on [huggingface.co/models](https://huggingface.co/models?search=nli). + """ + + _load_processor = False + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), **kwargs): + self._args_parser = args_parser + super().__init__(**kwargs) + if self.entailment_id == -1: + logger.warning( + "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to " + "-1. 
Define a descriptive label2id mapping in the model config to ensure correct outputs." + ) + + @property + def entailment_id(self): + for label, ind in self.model.config.label2id.items(): + if label.lower().startswith("entail"): + return ind + return -1 + + def _parse_and_tokenize( + self, sequence_pairs, padding=True, add_special_tokens=True, truncation=TruncationStrategy.ONLY_FIRST, **kwargs + ): + """ + Parse arguments and tokenize only_first so that hypothesis (label) is not truncated + """ + return_tensors = self.framework + if self.tokenizer.pad_token is None: + # Override for tokenizers not supporting padding + logger.error( + "Tokenizer was not supporting padding necessary for zero-shot, attempting to use " + " `pad_token=eos_token`" + ) + self.tokenizer.pad_token = self.tokenizer.eos_token + try: + inputs = self.tokenizer( + sequence_pairs, + add_special_tokens=add_special_tokens, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + ) + except Exception as e: + if "too short" in str(e): + # tokenizers might yell that we want to truncate + # to a value that is not even reached by the input. + # In that case we don't want to truncate. + # It seems there's not a really better way to catch that + # exception. + + inputs = self.tokenizer( + sequence_pairs, + add_special_tokens=add_special_tokens, + return_tensors=return_tensors, + padding=padding, + truncation=TruncationStrategy.DO_NOT_TRUNCATE, + ) + else: + raise e + + return inputs + + def _sanitize_parameters(self, **kwargs): + if kwargs.get("multi_class") is not None: + kwargs["multi_label"] = kwargs["multi_class"] + logger.warning( + "The `multi_class` argument has been deprecated and renamed to `multi_label`. " + "`multi_class` will be removed in a future version of Transformers." + ) + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = self._args_parser._parse_labels(kwargs["candidate_labels"]) + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + postprocess_params = {} + if "multi_label" in kwargs: + postprocess_params["multi_label"] = kwargs["multi_label"] + return preprocess_params, {}, postprocess_params + + def __call__( + self, + sequences: Union[str, list[str]], + *args, + **kwargs, + ): + """ + Classify the sequence(s) given as inputs. See the [`ZeroShotClassificationPipeline`] documentation for more + information. + + Args: + sequences (`str` or `list[str]`): + The sequence(s) to classify, will be truncated if the model input is too large. + candidate_labels (`str` or `list[str]`): + The set of possible class labels to classify each sequence into. Can be a single label, a string of + comma-separated labels, or a list of labels. + hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`): + The template used to turn each label into an NLI-style hypothesis. This template must include a {} or + similar syntax for the candidate label to be inserted into the template. For example, the default + template is `"This example is {}."` With the candidate label `"sports"`, this would be fed into the + model like `" sequence to classify This example is sports . "`. The default template + works well in many cases, but it may be worthwhile to experiment with different templates depending on + the task setting. + multi_label (`bool`, *optional*, defaults to `False`): + Whether or not multiple candidate labels can be true. 
If `False`, the scores are normalized such that + the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered + independent and probabilities are normalized for each candidate by doing a softmax of the entailment + score vs. the contradiction score. + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **sequence** (`str`) -- The sequence for which this is the output. + - **labels** (`list[str]`) -- The labels sorted by order of likelihood. + - **scores** (`list[float]`) -- The probabilities for each of the labels. + """ + if len(args) == 0: + pass + elif len(args) == 1 and "candidate_labels" not in kwargs: + kwargs["candidate_labels"] = args[0] + else: + raise ValueError(f"Unable to understand extra arguments {args}") + + return super().__call__(sequences, **kwargs) + + def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."): + sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template) + + for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)): + model_input = self._parse_and_tokenize([sequence_pair]) + + yield { + "candidate_label": candidate_label, + "sequence": sequences[0], + "is_last": i == len(candidate_labels) - 1, + **model_input, + } + + def _forward(self, inputs): + candidate_label = inputs["candidate_label"] + sequence = inputs["sequence"] + model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names} + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters: + model_inputs["use_cache"] = False + outputs = self.model(**model_inputs) + + model_outputs = { + "candidate_label": candidate_label, + "sequence": sequence, + "is_last": inputs["is_last"], + **outputs, + } + return model_outputs + + def postprocess(self, model_outputs, multi_label=False): + candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] + sequences = [outputs["sequence"] for outputs in model_outputs] + if self.framework == "pt": + logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs]) + else: + logits = np.concatenate([output["logits"].numpy() for output in model_outputs]) + N = logits.shape[0] + n = len(candidate_labels) + num_sequences = N // n + reshaped_outputs = logits.reshape((num_sequences, n, -1)) + + if multi_label or len(candidate_labels) == 1: + # softmax over the entailment vs. 
contradiction dim for each label independently + entailment_id = self.entailment_id + contradiction_id = -1 if entailment_id == 0 else 0 + entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]] + scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True) + scores = scores[..., 1] + else: + # softmax the "entailment" logits over all candidate labels + entail_logits = reshaped_outputs[..., self.entailment_id] + scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True) + + top_inds = list(reversed(scores[0].argsort())) + return { + "sequence": sequences[0], + "labels": [candidate_labels[i] for i in top_inds], + "scores": scores[0, top_inds].tolist(), + } diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_image_classification.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..6aeb9162030689b989fa6e1eabf4445dc9471372 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_image_classification.py @@ -0,0 +1,216 @@ +import warnings +from collections import UserDict +from typing import Any, Union, overload + +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + from ..tf_utils import stable_softmax + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ZeroShotImageClassificationPipeline(Pipeline): + """ + Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you + provide an image and a set of `candidate_labels`. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="google/siglip-so400m-patch14-384") + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["animals", "humans", "landscape"], + ... ) + [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}] + + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["black and white", "photorealist", "painting"], + ... ) + [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-image-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification). 
+ """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + requires_backends(self, "vision") + self.check_model_type( + TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + ) + + @overload + def __call__( + self, image: Union[str, "Image.Image"], candidate_labels: list[str], **kwargs: Any + ) -> list[dict[str, Any]]: ... + + @overload + def __call__( + self, image: Union[list[str], list["Image.Image"]], candidate_labels: list[str], **kwargs: Any + ) -> list[list[dict[str, Any]]]: ... + + def __call__( + self, + image: Union[str, list[str], "Image.Image", list["Image.Image"]], + candidate_labels: list[str], + **kwargs: Any, + ) -> Union[list[dict[str, Any]], list[list[dict[str, Any]]]]: + """ + Assign labels to the image(s) passed as inputs. + + Args: + image (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + candidate_labels (`list[str]`): + The candidate labels for this image. They will be formatted using *hypothesis_template*. + + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): + The format used in conjunction with *candidate_labels* to attempt the image classification by + replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are + already formatted. + + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list of dictionaries containing one entry per proposed label. Each dictionary contains the + following keys: + - **label** (`str`) -- One of the suggested *candidate_labels*. + - **score** (`float`) -- The score attributed by the model to that label. It is a value between + 0 and 1, computed as the `softmax` of `logits_per_image`. 
+ """ + # After deprecation of this is completed, remove the default `None` value for `image` + if "images" in kwargs: + image = kwargs.pop("images") + if image is None: + raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!") + return super().__call__(image, candidate_labels=candidate_labels, **kwargs) + + def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + if tokenizer_kwargs is not None: + warnings.warn( + "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) + preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs + + return preprocess_params, {}, {} + + def preprocess( + self, + image, + candidate_labels=None, + hypothesis_template="This is a photo of {}.", + timeout=None, + tokenizer_kwargs=None, + ): + if tokenizer_kwargs is None: + tokenizer_kwargs = {} + image = load_image(image, timeout=timeout) + inputs = self.image_processor(images=[image], return_tensors=self.framework) + if self.framework == "pt": + inputs = inputs.to(self.dtype) + inputs["candidate_labels"] = candidate_labels + sequences = [hypothesis_template.format(x) for x in candidate_labels] + tokenizer_default_kwargs = {"padding": True} + if "siglip" in self.model.config.model_type: + tokenizer_default_kwargs.update(padding="max_length", max_length=64, truncation=True) + tokenizer_default_kwargs.update(tokenizer_kwargs) + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, **tokenizer_default_kwargs) + inputs["text_inputs"] = [text_inputs] + return inputs + + def _forward(self, model_inputs): + candidate_labels = model_inputs.pop("candidate_labels") + text_inputs = model_inputs.pop("text_inputs") + if isinstance(text_inputs[0], UserDict): + text_inputs = text_inputs[0] + else: + # Batching case. 
+ text_inputs = text_inputs[0][0] + + outputs = self.model(**text_inputs, **model_inputs) + + model_outputs = { + "candidate_labels": candidate_labels, + "logits": outputs.logits_per_image, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = model_outputs.pop("candidate_labels") + logits = model_outputs["logits"][0] + if self.framework == "pt" and "siglip" in self.model.config.model_type: + probs = torch.sigmoid(logits).squeeze(-1) + scores = probs.tolist() + if not isinstance(scores, list): + scores = [scores] + elif self.framework == "pt": + probs = logits.softmax(dim=-1).squeeze(-1) + scores = probs.tolist() + if not isinstance(scores, list): + scores = [scores] + elif self.framework == "tf": + probs = stable_softmax(logits, axis=-1) + scores = probs.numpy().tolist() + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_object_detection.py b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..55154af9ab3b30e93adf904e25b13b4930b5aa53 --- /dev/null +++ b/venv/lib/python3.13/site-packages/transformers/pipelines/zero_shot_object_detection.py @@ -0,0 +1,248 @@ +from typing import Any, Optional, Union, overload + +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends +from .base import ChunkPipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image, valid_images + +if is_torch_available(): + import torch + + from transformers.modeling_outputs import BaseModelOutput + + from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ZeroShotObjectDetectionPipeline(ChunkPipeline): + """ + Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of + objects when you provide an image and a set of `candidate_labels`. + + Example: + + ```python + >>> from transformers import pipeline + + >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection") + >>> detector( + ... "http://images.cocodataset.org/val2017/000000039769.jpg", + ... candidate_labels=["cat", "couch"], + ... ) + [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}] + + >>> detector( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["head", "bird"], + ... ) + [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-object-detection"`. 
+ + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection). + """ + + _load_processor = False + _load_image_processor = True + _load_feature_extractor = False + _load_tokenizer = True + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if self.framework == "tf": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES) + + @overload + def __call__( + self, image: Union[str, "Image.Image"], candidate_labels: Union[str, list[str]], **kwargs: Any + ) -> list[dict[str, Any]]: ... + + @overload + def __call__(self, image: list[dict[str, Any]], **kwargs: Any) -> list[list[dict[str, Any]]]: ... + + def __call__( + self, + image: Union[str, "Image.Image", list[dict[str, Any]]], + candidate_labels: Optional[Union[str, list[str]]] = None, + **kwargs: Any, + ) -> Union[list[dict[str, Any]], list[list[dict[str, Any]]]]: + """ + Detect objects (bounding boxes & classes) in the image(s) passed as inputs. + + Args: + image (`str`, `PIL.Image` or `list[dict[str, Any]]`): + The pipeline handles three types of images: + + - A string containing an http url pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + You can use this parameter to send directly a list of images, or a dataset or a generator like so: + + ```python + >>> from transformers import pipeline + + >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection") + >>> detector( + ... [ + ... { + ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg", + ... "candidate_labels": ["cat", "couch"], + ... }, + ... { + ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg", + ... "candidate_labels": ["cat", "couch"], + ... }, + ... ] + ... ) + [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]] + ``` + + + candidate_labels (`str` or `list[str]` or `list[list[str]]`): + What the model should recognize in the image. + + threshold (`float`, *optional*, defaults to 0.1): + The probability necessary to make a prediction. + + top_k (`int`, *optional*, defaults to None): + The number of top predictions that will be returned by the pipeline. If the provided number is `None` + or higher than the number of predictions available, it will default to the number of predictions. + + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + + Return: + A list of lists containing prediction results, one list per input image. Each list contains dictionaries + with the following keys: + + - **label** (`str`) -- Text query corresponding to the found object. + - **score** (`float`) -- Score corresponding to the object (between 0 and 1). 
+ - **box** (`dict[str,int]`) -- Bounding box of the detected object in image's original size. It is a + dictionary with `x_min`, `x_max`, `y_min`, `y_max` keys. + """ + if "text_queries" in kwargs: + candidate_labels = kwargs.pop("text_queries") + + if isinstance(image, (str, Image.Image)): + inputs = {"image": image, "candidate_labels": candidate_labels} + elif isinstance(image, (list, tuple)) and valid_images(image): + return list( + super().__call__( + ({"image": img, "candidate_labels": labels} for img, labels in zip(image, candidate_labels)), + **kwargs, + ) + ) + else: + """ + Supports the following format + - {"image": image, "candidate_labels": candidate_labels} + - [{"image": image, "candidate_labels": candidate_labels}] + - Generator and datasets + This is a common pattern in other multimodal pipelines, so we support it here as well. + """ + inputs = image + + results = super().__call__(inputs, **kwargs) + return results + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + postprocess_params = {} + if "threshold" in kwargs: + postprocess_params["threshold"] = kwargs["threshold"] + if "top_k" in kwargs: + postprocess_params["top_k"] = kwargs["top_k"] + return preprocess_params, {}, postprocess_params + + def preprocess(self, inputs, timeout=None): + image = load_image(inputs["image"], timeout=timeout) + candidate_labels = inputs["candidate_labels"] + if isinstance(candidate_labels, str): + candidate_labels = candidate_labels.split(",") + + target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32) + for i, candidate_label in enumerate(candidate_labels): + text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework) + image_features = self.image_processor(image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.dtype) + yield { + "is_last": i == len(candidate_labels) - 1, + "target_size": target_size, + "candidate_label": candidate_label, + **text_inputs, + **image_features, + } + + def _forward(self, model_inputs): + target_size = model_inputs.pop("target_size") + candidate_label = model_inputs.pop("candidate_label") + is_last = model_inputs.pop("is_last") + + outputs = self.model(**model_inputs) + + model_outputs = {"target_size": target_size, "candidate_label": candidate_label, "is_last": is_last, **outputs} + return model_outputs + + def postprocess(self, model_outputs, threshold=0.1, top_k=None): + results = [] + for model_output in model_outputs: + label = model_output["candidate_label"] + model_output = BaseModelOutput(model_output) + outputs = self.image_processor.post_process_object_detection( + outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"] + )[0] + + for index in outputs["scores"].nonzero(): + score = outputs["scores"][index].item() + box = self._get_bounding_box(outputs["boxes"][index][0]) + + result = {"score": score, "label": label, "box": box} + results.append(result) + + results = sorted(results, key=lambda x: x["score"], reverse=True) + if top_k: + results = results[:top_k] + + return results + + def _get_bounding_box(self, box: "torch.Tensor") -> dict[str, int]: + """ + Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... } + + Args: + box (`torch.Tensor`): Tensor containing the coordinates in corners format. + + Returns: + bbox (`dict[str, int]`): Dict containing the coordinates in corners format. 
+ """ + if self.framework != "pt": + raise ValueError("The ZeroShotObjectDetectionPipeline is only available in PyTorch.") + xmin, ymin, xmax, ymax = box.int().tolist() + bbox = { + "xmin": xmin, + "ymin": ymin, + "xmax": xmax, + "ymax": ymax, + } + return bbox